Gbaraldi/threadtrace2 (#724)
* Added first ATT API * Finalizing thread trace API * Fixing more rebase conflicts * Added codeobj disassembly sample * Fixing merge issues with rebase [2] * Adding ATT packets * Implemented thread trace intercept * Moved codeobj parser to same repo as rocprofiler * Moved thread trace to new API * Fixing merge conflicts * Fixing more merge conflicts * Adding thread trace packet reuse * Merged aql_profile_v2 headers * Linked ATT sample to aqlprofile * Updated decoder to include non-loaded codeobjs * Implemented ISA decoder into ATT sample * Added marker_id to vaddr * Updating aql_profile_v2 API to memcpy * Updating thread trace API to include 64bit markers. Using the result of ISA matching. * Added instruction type and cycles summary * Updated sample with selection of kernel by kernel_object * Added option to copy from memory kernels * Moved tool_data in thread_trace to dynamic alloc * Restoring hsa.cpp * Fixed ATT sample crash. General improvements. * Moved codeobj library to outside src/ * Updated license header * Moved codeobj_capture to camelcase * Solving some more merge conflicts * Update samples/advanced_thread_trace/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update samples/advanced_thread_trace/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update samples/code_object_isa_decode/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt * Removing unused parameter check * Adding const to isEmpty * Removing unused warning * Adding libdw-dev to requirements * Running clang-format * Commenting out new aql calls * Clang format * Unused variable fix * Adding codeobj-decoder coverage * Commenting out threadtrace * Update samples/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * P * WOverloaded * Addressing clang-tidy * Virtual destructor on ttracer class * Corr id * Fixing code source format * Update CMakeLists.txt * Build fixes * Update source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Fix shadowing * Update CMakeLists.txt * Update samples/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Ammar ELWazir <ammar.elwazir@amd.com> Co-authored-by: Ammar ELWazir <aelwazir@amd.com> Co-authored-by: Benjamin Welton <bewelton@amd.com>
This commit is contained in:
کامیت شده توسط
GitHub
والد
d2bad14bf6
کامیت
69b8a43dc6
@@ -200,7 +200,7 @@ jobs:
|
||||
run: |
|
||||
git config --global --add safe.directory '*'
|
||||
apt-get update
|
||||
apt-get install -y cmake python3-pip gcovr wkhtmltopdf xvfb xfonts-base xfonts-75dpi xfonts-100dpi xfonts-utils xfonts-encodings libfontconfig
|
||||
apt-get install -y cmake python3-pip gcovr wkhtmltopdf xvfb xfonts-base xfonts-75dpi xfonts-100dpi xfonts-utils xfonts-encodings libfontconfig libdw-dev
|
||||
python3 -m pip install -r requirements.txt
|
||||
python3 -m pip install pytest pycobertura
|
||||
|
||||
@@ -388,7 +388,7 @@ jobs:
|
||||
python3 -m pip install pytest
|
||||
add-apt-repository ppa:ubuntu-toolchain-r/test
|
||||
apt-get update
|
||||
apt-get install -y g++-13
|
||||
apt-get install -y g++-13 libdw-dev
|
||||
update-alternatives --install $(which gcc) gcc $(which gcc-13) 100 --slave $(which g++) g++ $(which g++-13)
|
||||
realpath $(which gcc)
|
||||
realpath $(which g++)
|
||||
|
||||
@@ -29,3 +29,4 @@ add_subdirectory(api_buffered_tracing)
|
||||
add_subdirectory(code_object_tracing)
|
||||
add_subdirectory(counter_collection)
|
||||
add_subdirectory(intercept_table)
|
||||
# add_subdirectory(code_object_isa_decode) add_subdirectory(advanced_thread_trace)
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-sdk-samples-advanced-thread-trace LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
|
||||
add_library(advanced-thread-trace-client SHARED)
|
||||
target_sources(advanced-thread-trace-client PRIVATE client.cpp)
|
||||
target_link_libraries(
|
||||
advanced-thread-trace-client
|
||||
PRIVATE rocprofiler::rocprofiler rocprofiler::samples-build-flags
|
||||
rocprofiler-sdk-codeobj rocprofiler::samples-common-library)
|
||||
|
||||
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
|
||||
set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g")
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
add_executable(advanced-thread-trace)
|
||||
target_sources(advanced-thread-trace PRIVATE main.cpp)
|
||||
target_link_libraries(
|
||||
advanced-thread-trace PRIVATE advanced-thread-trace-client Threads::Threads
|
||||
rocprofiler::samples-build-flags)
|
||||
|
||||
add_test(NAME advanced-thread-trace COMMAND $<TARGET_FILE:advanced-thread-trace>)
|
||||
|
||||
set_tests_properties(
|
||||
advanced-thread-trace
|
||||
PROPERTIES
|
||||
TIMEOUT
|
||||
45
|
||||
LABELS
|
||||
"samples"
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
@@ -0,0 +1,578 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
//
|
||||
// undefine NDEBUG so asserts are implemented
|
||||
#ifdef NDEBUG
|
||||
# undef NDEBUG
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @file samples/code_object_isa_decode/client.cpp
|
||||
*
|
||||
* @brief Example rocprofiler client (tool)
|
||||
*/
|
||||
|
||||
#include <rocprofiler-sdk/buffer.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h"
|
||||
|
||||
#include "code_object_track.hpp"
|
||||
#include "common/defines.hpp"
|
||||
#include "common/filesystem.hpp"
|
||||
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
|
||||
|
||||
#include <cxxabi.h>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#define TARGET_CU 1
|
||||
#define SIMD_SELECT 0x3
|
||||
#define BUFFER_SIZE 0x6000000
|
||||
#define NUM_SE 2
|
||||
constexpr bool COPY_MEMORY_CODEOBJ = false;
|
||||
|
||||
template <>
|
||||
struct std::hash<pcinfo_t>
|
||||
{
|
||||
uint64_t operator()(const pcinfo_t& info) const
|
||||
{
|
||||
return info.addr ^ (info.marker_id << 32ul) ^ (info.marker_id >> 32ul);
|
||||
}
|
||||
};
|
||||
|
||||
bool
|
||||
operator==(const pcinfo_t& a, const pcinfo_t& b)
|
||||
{
|
||||
return a.addr == b.addr && a.marker_id == b.marker_id;
|
||||
};
|
||||
|
||||
bool
|
||||
operator<(const pcinfo_t& a, const pcinfo_t& b)
|
||||
{
|
||||
if(a.marker_id == b.marker_id) return a.addr < b.addr;
|
||||
return a.marker_id < b.marker_id;
|
||||
};
|
||||
|
||||
namespace client
|
||||
{
|
||||
using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
|
||||
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
|
||||
rocprofiler_client_id_t* client_id = nullptr;
|
||||
rocprofiler_context_id_t client_ctx = {};
|
||||
|
||||
struct isa_map_elem_t
|
||||
{
|
||||
std::atomic<size_t> hitcount{0};
|
||||
std::atomic<size_t> latency{0};
|
||||
std::shared_ptr<Instruction> code_line{nullptr};
|
||||
};
|
||||
|
||||
struct ToolData
|
||||
{
|
||||
std::shared_mutex isa_map_mut;
|
||||
std::mutex output_mut;
|
||||
CodeobjAddressTranslate codeobjTranslate;
|
||||
std::map<pcinfo_t, std::unique_ptr<isa_map_elem_t>> isa_map;
|
||||
std::unordered_map<uint64_t, SymbolInfo> kernels_in_codeobj = {};
|
||||
std::unordered_map<uint64_t, std::string> kernel_object_to_kernel_name = {};
|
||||
std::stringstream output;
|
||||
|
||||
std::stringstream printKernel(uint64_t vaddr)
|
||||
{
|
||||
std::stringstream ss;
|
||||
try
|
||||
{
|
||||
ss << '\n' << std::hex;
|
||||
SymbolInfo& info = kernels_in_codeobj.at(vaddr);
|
||||
|
||||
ss << std::hex << "Found: " << info.name << " at addr: 0x" << vaddr << " with offset 0x"
|
||||
<< info.faddr << " vaddr 0x" << info.vaddr << std::dec << '\n';
|
||||
} catch(std::exception& e)
|
||||
{
|
||||
ss << e.what() << '\n';
|
||||
}
|
||||
return ss;
|
||||
}
|
||||
};
|
||||
|
||||
struct source_location
|
||||
{
|
||||
std::string function = {};
|
||||
std::string file = {};
|
||||
uint32_t line = 0;
|
||||
std::string context = {};
|
||||
};
|
||||
|
||||
struct trace_data_t
|
||||
{
|
||||
int64_t id;
|
||||
uint8_t* data;
|
||||
uint64_t size;
|
||||
ToolData* tool;
|
||||
};
|
||||
|
||||
std::atomic<int> TRACE_DATA_ID{-1};
|
||||
std::atomic<int> KERNEL_ADDR_ID{-1};
|
||||
std::atomic<int> OCCUPANCY_ID{-1};
|
||||
|
||||
void
|
||||
tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
void* callback_data)
|
||||
{
|
||||
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
|
||||
if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return;
|
||||
|
||||
assert(callback_data && "Shader callback passed null!");
|
||||
ToolData& tool = *reinterpret_cast<ToolData*>(callback_data);
|
||||
|
||||
if(record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lg(tool.isa_map_mut);
|
||||
auto* data = static_cast<kernel_symbol_data_t*>(record.payload);
|
||||
tool.kernel_object_to_kernel_name.emplace(data->kernel_object, data->kernel_name);
|
||||
}
|
||||
|
||||
if(record.operation != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD) return;
|
||||
|
||||
std::unique_lock<std::shared_mutex> lg(tool.isa_map_mut);
|
||||
auto* data = static_cast<code_obj_load_data_t*>(record.payload);
|
||||
|
||||
if(std::string_view(data->uri).find("file:///") == 0)
|
||||
{
|
||||
tool.codeobjTranslate.addDecoder(
|
||||
data->uri, data->code_object_id, data->load_delta, data->load_size);
|
||||
auto symbolmap = tool.codeobjTranslate.getSymbolMap(data->code_object_id);
|
||||
for(auto& [vaddr, symbol] : symbolmap)
|
||||
tool.kernels_in_codeobj[vaddr] = symbol;
|
||||
}
|
||||
else if(COPY_MEMORY_CODEOBJ)
|
||||
{
|
||||
tool.codeobjTranslate.addDecoder(reinterpret_cast<const void*>(data->memory_base),
|
||||
data->memory_size,
|
||||
data->code_object_id,
|
||||
data->load_delta,
|
||||
data->load_size);
|
||||
auto symbolmap = tool.codeobjTranslate.getSymbolMap(data->code_object_id);
|
||||
for(auto& [vaddr, symbol] : symbolmap)
|
||||
tool.kernels_in_codeobj[vaddr] = symbol;
|
||||
}
|
||||
|
||||
(void) user_data;
|
||||
(void) callback_data;
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||
|
||||
rocprofiler_att_control_flags_t
|
||||
dispatch_callback(rocprofiler_queue_id_t queue_id,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t correlation_id,
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet,
|
||||
uint64_t kernel_id,
|
||||
void* userdata)
|
||||
{
|
||||
assert(userdata && "Dispatch callback passed null!");
|
||||
ToolData& tool = *reinterpret_cast<ToolData*>(userdata);
|
||||
|
||||
std::shared_lock<std::shared_mutex> lg(tool.isa_map_mut);
|
||||
|
||||
constexpr int desired_call_iteration = 1;
|
||||
static std::atomic<int> call_id{0};
|
||||
static std::string_view desired_func_name = "transposeLdsSwapInplace";
|
||||
|
||||
try
|
||||
{
|
||||
auto& kernel_name = tool.kernel_object_to_kernel_name.at(dispatch_packet->kernel_object);
|
||||
if(kernel_name.find(desired_func_name) == std::string::npos)
|
||||
return ROCPROFILER_ATT_CONTROL_NONE;
|
||||
|
||||
if(call_id.fetch_add(1) == desired_call_iteration)
|
||||
return ROCPROFILER_ATT_CONTROL_START_AND_STOP;
|
||||
} catch(...)
|
||||
{
|
||||
std::cerr << "Could not find kernel object: " << dispatch_packet->kernel_object
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
return ROCPROFILER_ATT_CONTROL_NONE;
|
||||
}
|
||||
|
||||
void
|
||||
iterate_trace_types(int id, const char* metadata, void*)
|
||||
{
|
||||
if(std::string_view(metadata).find("occupancy") == 0)
|
||||
OCCUPANCY_ID.store(id);
|
||||
else if(std::string_view(metadata).find("kernel_ids_addr") == 0)
|
||||
KERNEL_ADDR_ID.store(id);
|
||||
else if(std::string_view(metadata).find("tracedata") == 0)
|
||||
TRACE_DATA_ID.store(id);
|
||||
}
|
||||
|
||||
hsa_status_t
|
||||
get_trace_data(int trace_type_id,
|
||||
int correlation_id,
|
||||
void* trace_events,
|
||||
uint64_t trace_size,
|
||||
void* userdata)
|
||||
{
|
||||
assert(userdata && "ISA callback passed null!");
|
||||
trace_data_t& trace_data = *reinterpret_cast<trace_data_t*>(userdata);
|
||||
assert(trace_data.tool && "ISA callback passed null!");
|
||||
ToolData& tool = *reinterpret_cast<ToolData*>(trace_data.tool);
|
||||
|
||||
std::stringstream ss;
|
||||
std::shared_lock<std::shared_mutex> shared_lock(tool.isa_map_mut);
|
||||
|
||||
if(trace_type_id == OCCUPANCY_ID)
|
||||
{
|
||||
ss << "Num waves: " << trace_size / 2 << '\n';
|
||||
// auto* occ = reinterpret_cast<att_occupancy_info_t*>(trace_events);
|
||||
}
|
||||
else if(trace_type_id == KERNEL_ADDR_ID)
|
||||
{
|
||||
ss << "Num KRN events: " << trace_size << std::hex << '\n';
|
||||
auto* kaddr = reinterpret_cast<pcinfo_t*>(trace_events);
|
||||
for(size_t i = 0; i < trace_size; i++)
|
||||
if(kaddr[i].addr != 0)
|
||||
{
|
||||
ss << " - ADDR: " << kaddr[i].addr << ' ' << tool.printKernel(kaddr[i].addr).str()
|
||||
<< '\n';
|
||||
}
|
||||
ss << std::dec;
|
||||
}
|
||||
else if(trace_type_id == TRACE_DATA_ID)
|
||||
{
|
||||
ss << "Trace Length: " << trace_size << '\n';
|
||||
auto* tracedata = reinterpret_cast<att_trace_event_t*>(trace_events);
|
||||
|
||||
for(size_t i = 0; i < trace_size; i++)
|
||||
{
|
||||
pcinfo_t pc = tracedata[i].pc;
|
||||
auto it = tool.isa_map.find(pc);
|
||||
if(it == tool.isa_map.end())
|
||||
{
|
||||
shared_lock.unlock();
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> unique_lock(tool.isa_map_mut);
|
||||
auto ptr = std::make_unique<isa_map_elem_t>();
|
||||
ptr->code_line = tool.codeobjTranslate.get(pc.marker_id, pc.addr);
|
||||
it = tool.isa_map.emplace(pc, std::move(ptr)).first;
|
||||
}
|
||||
shared_lock.lock();
|
||||
}
|
||||
|
||||
it->second->hitcount.fetch_add(tracedata[i].hitcount, std::memory_order_relaxed);
|
||||
it->second->latency.fetch_add(tracedata[i].latency, std::memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_lock<std::mutex> lk(tool.output_mut);
|
||||
tool.output << ss.str();
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
copy_trace_data(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userdata)
|
||||
{
|
||||
trace_data_t& data = *reinterpret_cast<trace_data_t*>(userdata);
|
||||
*seid = data.id;
|
||||
*buffer_size = data.size;
|
||||
*buffer = data.data;
|
||||
data.size = 0;
|
||||
return *buffer_size;
|
||||
}
|
||||
|
||||
hsa_status_t
|
||||
isa_callback(char* isa_instruction,
|
||||
char* source_reference,
|
||||
uint64_t* isa_memory_size,
|
||||
uint64_t* isa_size,
|
||||
uint64_t* source_size,
|
||||
uint64_t marker_id,
|
||||
uint64_t offset,
|
||||
void* userdata)
|
||||
{
|
||||
assert(userdata && "ISA callback passed null!");
|
||||
trace_data_t& trace_data = *reinterpret_cast<trace_data_t*>(userdata);
|
||||
assert(trace_data.tool && "ISA callback passed null!");
|
||||
ToolData& tool = *reinterpret_cast<ToolData*>(trace_data.tool);
|
||||
|
||||
std::shared_ptr<Instruction> instruction;
|
||||
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> unique_lock(tool.isa_map_mut);
|
||||
instruction = tool.codeobjTranslate.get(marker_id, offset);
|
||||
}
|
||||
|
||||
if(!instruction.get()) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
|
||||
{
|
||||
size_t tmp_isa_size = *isa_size;
|
||||
size_t tmp_source_size = *source_size;
|
||||
*isa_size = instruction->inst.size();
|
||||
*source_size = instruction->comment.size();
|
||||
|
||||
if(*isa_size > tmp_isa_size || *source_size > tmp_source_size)
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
memcpy(isa_instruction, instruction->inst.data(), *isa_size);
|
||||
memcpy(source_reference, instruction->comment.data(), *source_size);
|
||||
*isa_memory_size = instruction->size;
|
||||
|
||||
auto ptr = std::make_unique<isa_map_elem_t>();
|
||||
ptr->code_line = std::move(instruction);
|
||||
tool.isa_map.emplace(pcinfo_t{offset, marker_id}, std::move(ptr));
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void
|
||||
shader_data_callback(int64_t se_id,
|
||||
int64_t data_type_id,
|
||||
const char* data_type_name,
|
||||
void* se_data,
|
||||
size_t data_size,
|
||||
void* userdata)
|
||||
{
|
||||
assert(userdata && "Shader callback passed null!");
|
||||
ToolData& tool = *reinterpret_cast<ToolData*>(userdata);
|
||||
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(tool.output_mut);
|
||||
tool.output << "SE ID: " << se_id << " with size " << data_size << std::hex << '\n';
|
||||
}
|
||||
trace_data_t data{.id = se_id, .data = (uint8_t*) se_data, .size = data_size, .tool = &tool};
|
||||
auto status = aqlprofile_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data);
|
||||
(void) status;
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
int
|
||||
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
{
|
||||
(void) fini_func;
|
||||
aqlprofile_att_parser_iterate_event_list(iterate_trace_types, nullptr);
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_callback_tracing_service(client_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
nullptr,
|
||||
0,
|
||||
tool_codeobj_tracing_callback,
|
||||
tool_data),
|
||||
"code object tracing service configure");
|
||||
|
||||
rocprofiler_att_parameters_t parameters{};
|
||||
parameters.target_cu = TARGET_CU;
|
||||
parameters.simd_select = SIMD_SELECT;
|
||||
parameters.buffer_size = BUFFER_SIZE;
|
||||
|
||||
std::vector<int> shaders;
|
||||
for(size_t i = 0; i < NUM_SE; i++)
|
||||
shaders.push_back(2 * i); // use shader engines 0, 2
|
||||
|
||||
parameters.shader_ids = shaders.data();
|
||||
parameters.shader_num = shaders.size();
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_thread_trace_service(
|
||||
client_ctx, parameters, dispatch_callback, shader_data_callback, tool_data),
|
||||
"thread trace service configure");
|
||||
|
||||
int valid_ctx = 0;
|
||||
ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx),
|
||||
"context validity check");
|
||||
if(valid_ctx == 0)
|
||||
{
|
||||
// notify rocprofiler that initialization failed
|
||||
// and all the contexts, buffers, etc. created
|
||||
// should be ignored
|
||||
return -1;
|
||||
}
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start");
|
||||
|
||||
// no errors
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
tool_fini(void* tool_data)
|
||||
{
|
||||
assert(tool_data && "tool_fini callback passed null!");
|
||||
ToolData& tool = *reinterpret_cast<ToolData*>(tool_data);
|
||||
|
||||
std::unique_lock<std::shared_mutex> isa_lk(tool.isa_map_mut);
|
||||
std::unique_lock<std::mutex> out_lk(tool.output_mut);
|
||||
|
||||
// Find largest instruction
|
||||
size_t max_inst_size = 0;
|
||||
for(auto& [addr, lines] : tool.isa_map)
|
||||
if(lines.get()) max_inst_size = std::max(max_inst_size, lines->code_line->inst.size());
|
||||
|
||||
std::string empty_space;
|
||||
empty_space.resize(max_inst_size, ' ');
|
||||
|
||||
size_t vmc_latency = 0;
|
||||
size_t lgk_latency = 0;
|
||||
size_t scalar_latency = 0;
|
||||
size_t vector_latency = 0;
|
||||
size_t other_latency = 0;
|
||||
|
||||
size_t scalar_exec = 0;
|
||||
size_t vector_exec = 0;
|
||||
size_t other_exec = 0;
|
||||
|
||||
for(auto& [addr, line] : tool.isa_map)
|
||||
if(line.get())
|
||||
{
|
||||
size_t hitcount = line->hitcount.load(std::memory_order_relaxed);
|
||||
size_t latency = line->latency.load(std::memory_order_relaxed);
|
||||
auto& code_line = line->code_line->inst;
|
||||
|
||||
tool.output << std::hex << "0x" << addr.addr << std::dec << ' ' << code_line
|
||||
<< empty_space.substr(0, max_inst_size - code_line.size())
|
||||
<< " Hit: " << hitcount << " - Latency: " << latency << '\n';
|
||||
|
||||
if(code_line.find("s_waitcnt") == 0)
|
||||
{
|
||||
other_exec += hitcount;
|
||||
if(code_line.find("lgkmcnt") != std::string::npos)
|
||||
lgk_latency += latency;
|
||||
else
|
||||
vmc_latency += latency;
|
||||
}
|
||||
else if(code_line.find("v_") == 0)
|
||||
{
|
||||
vector_exec += hitcount;
|
||||
vector_latency += latency;
|
||||
}
|
||||
else if(code_line.find("s_") == 0)
|
||||
{
|
||||
scalar_exec += hitcount;
|
||||
scalar_latency += latency;
|
||||
}
|
||||
else
|
||||
{
|
||||
other_exec += hitcount;
|
||||
other_latency += latency;
|
||||
}
|
||||
}
|
||||
|
||||
size_t total_exec = vector_exec + scalar_exec + other_exec;
|
||||
size_t memory_latency = vmc_latency + lgk_latency;
|
||||
size_t total_latency = memory_latency + vector_latency + scalar_latency + other_latency;
|
||||
float vmc_fraction = 100 * vmc_latency / float(total_latency);
|
||||
float lgk_fraction = 100 * lgk_latency / float(total_latency);
|
||||
|
||||
tool.output << "Total executed instructions: " << total_exec << '\n'
|
||||
<< "Total executed vector instructions: " << vector_exec << " with average "
|
||||
<< vector_latency / float(vector_exec) << " cycles.\n"
|
||||
<< "Total executed scalar instructions: " << scalar_exec << " with average "
|
||||
<< scalar_latency / float(scalar_exec) << " cycles.\n"
|
||||
<< "Vector memory ops occupied: " << vmc_fraction << "% of cycles.\n"
|
||||
<< "Scalar and LDS memory ops occupied: " << lgk_fraction << "% of cycles.\n";
|
||||
|
||||
std::cout << tool.output.str();
|
||||
}
|
||||
|
||||
void
|
||||
setup()
|
||||
{
|
||||
if(int status = 0;
|
||||
rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure),
|
||||
"force configuration");
|
||||
}
|
||||
}
|
||||
|
||||
// force configuration when library is loaded
|
||||
bool cfg_on_load = (client::setup(), true);
|
||||
} // namespace client
|
||||
|
||||
extern "C" rocprofiler_tool_configure_result_t*
|
||||
rocprofiler_configure(uint32_t version,
|
||||
const char* runtime_version,
|
||||
uint32_t priority,
|
||||
rocprofiler_client_id_t* id)
|
||||
{
|
||||
// only activate if main tool
|
||||
if(priority > 0) return nullptr;
|
||||
|
||||
// set the client name
|
||||
id->name = "Adv_Thread_Trace_Sample";
|
||||
|
||||
// store client info
|
||||
client::client_id = id;
|
||||
|
||||
// compute major/minor/patch version info
|
||||
uint32_t major = version / 10000;
|
||||
uint32_t minor = (version % 10000) / 100;
|
||||
uint32_t patch = version % 100;
|
||||
|
||||
// generate info string
|
||||
auto info = std::stringstream{};
|
||||
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
|
||||
<< " (" << runtime_version << ")";
|
||||
|
||||
std::clog << info.str() << std::endl;
|
||||
|
||||
auto* data = new client::ToolData{};
|
||||
|
||||
// create configure data
|
||||
static auto cfg =
|
||||
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
|
||||
&client::tool_init,
|
||||
&client::tool_fini,
|
||||
reinterpret_cast<void*>(data)};
|
||||
|
||||
// return pointer to configure data
|
||||
return &cfg;
|
||||
}
|
||||
@@ -0,0 +1,245 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include "transpose_kernels.hpp"
|
||||
|
||||
#define PRINT_ALIGN 36
|
||||
|
||||
namespace
|
||||
{
|
||||
using lock_guard_t = std::lock_guard<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
} // namespace
|
||||
|
||||
enum TransposeType
|
||||
{
|
||||
TRANSPOSE_NAIVE,
|
||||
TRANSPOSE_INPLACE_LDS,
|
||||
TRANSPOSE_NO_BANK_CONFLICTS
|
||||
};
|
||||
|
||||
class ITranspose
|
||||
{
|
||||
public:
|
||||
virtual void run(TransposeType ttype, int numThreadsY, int num_iter) = 0;
|
||||
virtual ~ITranspose(){};
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class Transpose : public ITranspose
|
||||
{
|
||||
public:
|
||||
Transpose(int dev, size_t _M)
|
||||
: devID(dev)
|
||||
, M(_M)
|
||||
, databytes(_M * _M * sizeof(T))
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(devID));
|
||||
HIP_API_CALL(hipStreamCreate(&stream));
|
||||
|
||||
std::default_random_engine _engine{std::random_device{}() * rand()};
|
||||
std::uniform_int_distribution<int> _dist{0, 1000};
|
||||
|
||||
inp_matrix = new T[M * M];
|
||||
out_matrix = new T[M * M];
|
||||
|
||||
for(size_t i = 0; i < M * M; i++)
|
||||
inp_matrix[i] = static_cast<T>(_dist(_engine));
|
||||
memset(out_matrix, 0, databytes);
|
||||
|
||||
HIP_API_CALL(hipMalloc(&in, databytes));
|
||||
HIP_API_CALL(hipMalloc(&out, databytes));
|
||||
HIP_API_CALL(hipMemsetAsync(in, 0, databytes, stream));
|
||||
HIP_API_CALL(hipMemsetAsync(out, 0, databytes, stream));
|
||||
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, databytes, hipMemcpyDefault, stream));
|
||||
|
||||
HIP_API_CALL(hipEventCreate(&start));
|
||||
HIP_API_CALL(hipEventCreate(&stop));
|
||||
}
|
||||
|
||||
void run(TransposeType ttype, int numThreadsY, int num_iter) override
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(devID));
|
||||
dim3 grid(M / TILE_DIM, M / TILE_DIM, 1);
|
||||
dim3 block(TILE_DIM, numThreadsY, 1);
|
||||
|
||||
auto Kernel = transposeNaive<T>;
|
||||
std::string KernelName = "transposeNaive";
|
||||
if(ttype == TransposeType::TRANSPOSE_NO_BANK_CONFLICTS)
|
||||
{
|
||||
Kernel = transposeLdsNoBankConflicts<T>;
|
||||
KernelName = "transposeLdsNoBankConflicts";
|
||||
}
|
||||
else if(ttype == TransposeType::TRANSPOSE_INPLACE_LDS)
|
||||
{
|
||||
Kernel = transposeLdsSwapInplace<T>;
|
||||
KernelName = "transposeLdsSwapInplace";
|
||||
}
|
||||
|
||||
{
|
||||
std::string functypeid = __PRETTY_FUNCTION__;
|
||||
auto it_beg = functypeid.rfind("[T = ");
|
||||
auto it_end = functypeid.rfind(']');
|
||||
|
||||
if(it_beg != std::string::npos) it_beg += std::string("[T = ").size();
|
||||
|
||||
if(it_beg < it_end && it_end != std::string::npos)
|
||||
KernelName += '<' + functypeid.substr(it_beg, it_end - it_beg) + '>';
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
HIP_API_CALL(hipEventRecord(start, stream));
|
||||
|
||||
for(int i = 0; i < num_iter; i++)
|
||||
{
|
||||
Kernel<<<grid, block, 0, stream>>>(out, in, M);
|
||||
HIP_API_CALL(hipGetLastError());
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipEventRecord(stop, stream));
|
||||
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, databytes, hipMemcpyDefault, stream));
|
||||
HIP_API_CALL(hipEventSynchronize(stop));
|
||||
|
||||
float time;
|
||||
HIP_API_CALL(hipEventElapsedTime(&time, start, stop));
|
||||
float GB = databytes * num_iter * 2 / float(1 << 30);
|
||||
|
||||
{
|
||||
lock_guard_t _lk{print_lock};
|
||||
std::cout << "The average performance of " << std::setw(38) << KernelName << " : "
|
||||
<< (1000 * GB / time) << " GB/s" << std::endl;
|
||||
}
|
||||
|
||||
verify();
|
||||
}
|
||||
|
||||
void verify() const
|
||||
{
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
for(int i = 0; i < 10; i++)
|
||||
{
|
||||
int row = rand() % M;
|
||||
int col = rand() % M;
|
||||
if(inp_matrix[row * M + col] != out_matrix[col * M + row])
|
||||
{
|
||||
lock_guard_t _lk{print_lock};
|
||||
std::cout << "mismatch: " << row << ", " << col << " : "
|
||||
<< inp_matrix[row * M + col] << " | " << out_matrix[col * M + row]
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~Transpose()
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(devID));
|
||||
HIP_API_CALL(hipEventDestroy(start));
|
||||
HIP_API_CALL(hipEventDestroy(stop));
|
||||
|
||||
HIP_API_CALL(hipFree(in));
|
||||
HIP_API_CALL(hipFree(out));
|
||||
HIP_API_CALL(hipStreamDestroy(stream));
|
||||
|
||||
delete[] inp_matrix;
|
||||
delete[] out_matrix;
|
||||
}
|
||||
|
||||
const int devID;
|
||||
const size_t M;
|
||||
const size_t databytes;
|
||||
|
||||
hipStream_t stream;
|
||||
hipEvent_t start, stop;
|
||||
|
||||
T* inp_matrix = nullptr;
|
||||
T* out_matrix = nullptr;
|
||||
|
||||
T* in = nullptr;
|
||||
T* out = nullptr;
|
||||
};
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
int deviceId = 0;
|
||||
int blockDimY = 8;
|
||||
int num_iter = 1;
|
||||
int mat_size = 8192;
|
||||
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
auto _arg = std::string{argv[i]};
|
||||
if(_arg == "?" || _arg == "-h" || _arg == "--help")
|
||||
{
|
||||
std::cout << "usage: transpose "
|
||||
<< "[MatrixSize (" << mat_size << ")] "
|
||||
<< "[numIter (" << num_iter << ")] "
|
||||
<< "[blockDimY (" << blockDimY << ")] "
|
||||
<< "[DEVICE_ID (" << deviceId << ")] " << std::endl;
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
if(argc > 1) mat_size = atoll(argv[1]);
|
||||
if(argc > 2) num_iter = atoll(argv[2]);
|
||||
if(argc > 3) blockDimY = atoll(argv[3]);
|
||||
if(argc > 4) deviceId = atoll(argv[4]);
|
||||
|
||||
printf("[transpose] Matrix size: %d, device ID: %d, num iter: %d, blockDimY: %d\n",
|
||||
mat_size,
|
||||
deviceId,
|
||||
num_iter,
|
||||
blockDimY);
|
||||
|
||||
int ndevice = 0;
|
||||
HIP_API_CALL(hipGetDeviceCount(&ndevice));
|
||||
printf("[transpose] Number of devices found: %i\n", ndevice);
|
||||
assert(ndevice > 0);
|
||||
|
||||
if(deviceId >= ndevice) exit(EXIT_FAILURE);
|
||||
|
||||
{
|
||||
std::vector<std::unique_ptr<ITranspose>> kernels;
|
||||
kernels.push_back(std::make_unique<Transpose<int>>(deviceId, mat_size));
|
||||
kernels.push_back(std::make_unique<Transpose<float>>(deviceId, mat_size));
|
||||
kernels.push_back(std::make_unique<Transpose<double>>(deviceId, mat_size));
|
||||
|
||||
for(auto& kernel : kernels)
|
||||
{
|
||||
kernel->run(TransposeType::TRANSPOSE_NAIVE, blockDimY, num_iter);
|
||||
kernel->run(TransposeType::TRANSPOSE_INPLACE_LDS, blockDimY, num_iter);
|
||||
kernel->run(TransposeType::TRANSPOSE_NO_BANK_CONFLICTS, blockDimY, num_iter);
|
||||
}
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
#pragma once
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#define HIP_API_CALL(CALL) \
|
||||
{ \
|
||||
hipError_t error_ = (CALL); \
|
||||
if(error_ != hipSuccess) \
|
||||
{ \
|
||||
lock_guard_t _hip_api_print_lk{print_lock}; \
|
||||
fprintf(stderr, \
|
||||
"%s:%d :: HIP error : %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
hipGetErrorString(error_)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define TILE_DIM 64
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
transposeNaive(T* odata, const T* idata, size_t size)
|
||||
{
|
||||
size_t idx = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
size_t block_posy = blockIdx.y * TILE_DIM;
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
odata[size * idx + block_posy + idy] = idata[idx + (block_posy + idy) * size];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
transposeLdsNoBankConflicts(T* odata, const T* idata, size_t size)
|
||||
{
|
||||
__shared__ T tile[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
size_t idy_in = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
size_t index_in = idx_in + idy_in * size;
|
||||
|
||||
size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
|
||||
size_t idy_out = blockIdx.x * TILE_DIM + threadIdx.y;
|
||||
size_t index_out = idx_out + idy_out * size;
|
||||
|
||||
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
|
||||
tile[threadIdx.y + y][threadIdx.x] = idata[index_in + y * size];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
|
||||
odata[index_out + y * size] = tile[threadIdx.x][threadIdx.y + y];
|
||||
}
|
||||
|
||||
// Generates more interesting ISA
|
||||
template <typename T>
|
||||
__global__ void
|
||||
transposeLdsSwapInplace(T* odata, const T* idata, size_t size)
|
||||
{
|
||||
__shared__ T tile[TILE_DIM][TILE_DIM];
|
||||
|
||||
const size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
tile[idy][threadIdx.x] = idata[idx_in + (idy + blockIdx.y * TILE_DIM) * size];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
if(idy < threadIdx.x)
|
||||
{
|
||||
T temp = tile[idy][threadIdx.x];
|
||||
tile[idy][threadIdx.x] = tile[threadIdx.x][idy];
|
||||
tile[threadIdx.x][idy] = temp;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
odata[(blockIdx.x * TILE_DIM + idy) * size + idx_out] = tile[idy][threadIdx.x];
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-sdk-samples-code-object-isa-decode LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
add_executable(code-object-isa-decode)
|
||||
target_sources(code-object-isa-decode PRIVATE main.cpp client.cpp)
|
||||
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
|
||||
set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g")
|
||||
|
||||
target_link_libraries(
|
||||
code-object-isa-decode PRIVATE Threads::Threads rocprofiler::samples-common-library
|
||||
rocprofiler-sdk-codeobj rocprofiler::rocprofiler)
|
||||
|
||||
add_test(NAME code-object-isa-decode COMMAND $<TARGET_FILE:code-object-isa-decode>)
|
||||
|
||||
set_tests_properties(
|
||||
code-object-isa-decode
|
||||
PROPERTIES
|
||||
TIMEOUT
|
||||
45
|
||||
LABELS
|
||||
"samples"
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
@@ -0,0 +1,251 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
//
|
||||
// undefine NDEBUG so asserts are implemented
|
||||
#ifdef NDEBUG
|
||||
# undef NDEBUG
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @file samples/code_object_isa_decode/client.cpp
|
||||
*
|
||||
* @brief Example rocprofiler client (tool)
|
||||
*/
|
||||
|
||||
#include <rocprofiler-sdk/buffer.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
|
||||
|
||||
#include "common/defines.hpp"
|
||||
#include "common/filesystem.hpp"
|
||||
|
||||
#include <cxxabi.h>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include "code_object_track.hpp"
|
||||
|
||||
namespace client
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
|
||||
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
using kernel_symbol_map_t = std::unordered_map<std::string, std::pair<uint64_t, size_t>>;
|
||||
|
||||
rocprofiler_client_id_t* client_id = nullptr;
|
||||
rocprofiler_client_finalize_t client_fini_func = nullptr;
|
||||
rocprofiler_context_id_t client_ctx = {};
|
||||
kernel_symbol_map_t registered_kernels = {};
|
||||
|
||||
CodeobjAddressTranslate codeobjTranslate;
|
||||
|
||||
void
|
||||
tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
void* callback_data)
|
||||
{
|
||||
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
|
||||
if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return;
|
||||
|
||||
if(record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD)
|
||||
{
|
||||
auto* data = static_cast<code_obj_load_data_t*>(record.payload);
|
||||
|
||||
if(std::string_view(data->uri).find("file:///") == 0)
|
||||
{
|
||||
codeobjTranslate.addDecoder(
|
||||
data->uri, data->code_object_id, data->load_base, data->load_size);
|
||||
auto symbolmap = codeobjTranslate.getSymbolMap();
|
||||
for(auto& [vaddr, symbol] : symbolmap)
|
||||
registered_kernels.insert({symbol.name, {vaddr, vaddr + symbol.mem_size}});
|
||||
}
|
||||
}
|
||||
else if(record.operation ==
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
|
||||
{
|
||||
std::cout << std::hex;
|
||||
auto* data = static_cast<kernel_symbol_data_t*>(record.payload);
|
||||
auto kernel_name = std::regex_replace(data->kernel_name, std::regex{"(\\.kd)$"}, "");
|
||||
|
||||
if(registered_kernels.find(kernel_name) == registered_kernels.end())
|
||||
{
|
||||
std::cout << "Not Found: " << kernel_name << " in codeobj." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
auto& begin_end = registered_kernels.at(kernel_name);
|
||||
|
||||
std::cout << std::hex << "Found: " << kernel_name << " at addr: 0x" << begin_end.first
|
||||
<< std::dec << ". Printing first 64 bytes:" << std::endl;
|
||||
|
||||
std::unordered_set<std::string> references{};
|
||||
int num_waitcnts = 0;
|
||||
int num_scalar = 0;
|
||||
int num_vector = 0;
|
||||
int num_other = 0;
|
||||
|
||||
size_t vaddr = begin_end.first;
|
||||
while(vaddr < begin_end.second)
|
||||
{
|
||||
auto inst = codeobjTranslate.get(vaddr);
|
||||
std::string_view source = inst->comment;
|
||||
if(source.rfind('/') < source.size()) source = source.substr(source.rfind('/'));
|
||||
if(vaddr < begin_end.first + 64) std::cout << '\t' << inst->inst << '\n';
|
||||
|
||||
if(source.rfind(':') < source.size()) source = source.substr(0, source.rfind(':'));
|
||||
|
||||
references.insert(std::string(source));
|
||||
if(inst->inst.find("v_") == 0)
|
||||
num_vector++;
|
||||
else if(inst->inst.find("s_waitcnt") == 0)
|
||||
num_waitcnts++;
|
||||
else if(inst->inst.find("s_") == 0)
|
||||
num_scalar++;
|
||||
else
|
||||
num_other++;
|
||||
|
||||
vaddr += inst->size;
|
||||
}
|
||||
|
||||
std::cout << " --- Num Scalar: " << num_scalar << "\n --- Num Vector: " << num_vector
|
||||
<< "\n --- Num Waitcnts: " << num_waitcnts
|
||||
<< "\n --- Other instructions: " << num_other
|
||||
<< "\nKernel has source references to: " << std::endl;
|
||||
for(auto& ref : references)
|
||||
std::cout << '\t' << ref << std::endl;
|
||||
}
|
||||
|
||||
(void) user_data;
|
||||
(void) callback_data;
|
||||
}
|
||||
|
||||
int
|
||||
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
{
|
||||
client_fini_func = fini_func;
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_callback_tracing_service(client_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
nullptr,
|
||||
0,
|
||||
tool_codeobj_tracing_callback,
|
||||
tool_data),
|
||||
"code object tracing service configure");
|
||||
|
||||
int valid_ctx = 0;
|
||||
ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx),
|
||||
"context validity check");
|
||||
if(valid_ctx == 0)
|
||||
{
|
||||
// notify rocprofiler that initialization failed
|
||||
// and all the contexts, buffers, etc. created
|
||||
// should be ignored
|
||||
return -1;
|
||||
}
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start");
|
||||
|
||||
// no errors
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
tool_fini(void* tool_data)
|
||||
{
|
||||
(void) tool_data;
|
||||
}
|
||||
|
||||
void
|
||||
setup()
|
||||
{
|
||||
if(int status = 0;
|
||||
rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure),
|
||||
"force configuration");
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// force configuration when library is loaded
|
||||
bool cfg_on_load = (client::setup(), true);
|
||||
} // namespace client
|
||||
|
||||
extern "C" rocprofiler_tool_configure_result_t*
|
||||
rocprofiler_configure(uint32_t version,
|
||||
const char* runtime_version,
|
||||
uint32_t priority,
|
||||
rocprofiler_client_id_t* id)
|
||||
{
|
||||
// only activate if main tool
|
||||
if(priority > 0) return nullptr;
|
||||
|
||||
// set the client name
|
||||
id->name = "ExampleTool";
|
||||
|
||||
// store client info
|
||||
client::client_id = id;
|
||||
|
||||
// compute major/minor/patch version info
|
||||
uint32_t major = version / 10000;
|
||||
uint32_t minor = (version % 10000) / 100;
|
||||
uint32_t patch = version % 100;
|
||||
|
||||
// generate info string
|
||||
auto info = std::stringstream{};
|
||||
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
|
||||
<< " (" << runtime_version << ")";
|
||||
|
||||
std::clog << info.str() << std::endl;
|
||||
|
||||
// create configure data
|
||||
static auto cfg =
|
||||
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
|
||||
&client::tool_init,
|
||||
&client::tool_fini,
|
||||
nullptr};
|
||||
|
||||
// return pointer to configure data
|
||||
return &cfg;
|
||||
}
|
||||
@@ -0,0 +1,246 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include "transpose_kernels.hpp"
|
||||
|
||||
#define PRINT_ALIGN 36
|
||||
|
||||
namespace
|
||||
{
|
||||
using lock_guard_t = std::lock_guard<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
} // namespace
|
||||
|
||||
enum TransposeType
|
||||
{
|
||||
TRANSPOSE_NAIVE,
|
||||
TRANSPOSE_INPLACE_LDS,
|
||||
TRANSPOSE_NO_BANK_CONFLICTS
|
||||
};
|
||||
|
||||
class ITranspose
|
||||
{
|
||||
public:
|
||||
virtual void run(TransposeType ttype, int numThreadsY, int num_iter) = 0;
|
||||
virtual ~ITranspose(){};
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class Transpose : public ITranspose
|
||||
{
|
||||
public:
|
||||
Transpose(int dev, size_t _M)
|
||||
: devID(dev)
|
||||
, M(_M)
|
||||
, databytes(_M * _M * sizeof(T))
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(devID));
|
||||
HIP_API_CALL(hipStreamCreate(&stream));
|
||||
|
||||
std::default_random_engine _engine{std::random_device{}() * rand()};
|
||||
std::uniform_int_distribution<int> _dist{0, 1000};
|
||||
|
||||
inp_matrix = new T[M * M];
|
||||
out_matrix = new T[M * M];
|
||||
|
||||
for(size_t i = 0; i < M * M; i++)
|
||||
inp_matrix[i] = static_cast<T>(_dist(_engine));
|
||||
memset(out_matrix, 0, databytes);
|
||||
|
||||
HIP_API_CALL(hipMalloc(&in, databytes));
|
||||
HIP_API_CALL(hipMalloc(&out, databytes));
|
||||
HIP_API_CALL(hipMemsetAsync(in, 0, databytes, stream));
|
||||
HIP_API_CALL(hipMemsetAsync(out, 0, databytes, stream));
|
||||
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, databytes, hipMemcpyDefault, stream));
|
||||
|
||||
HIP_API_CALL(hipEventCreate(&start));
|
||||
HIP_API_CALL(hipEventCreate(&stop));
|
||||
}
|
||||
|
||||
void run(TransposeType ttype, int numThreadsY, int num_iter) override
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(devID));
|
||||
dim3 grid(M / TILE_DIM, M / TILE_DIM, 1);
|
||||
dim3 block(TILE_DIM, numThreadsY, 1);
|
||||
|
||||
auto Kernel = transposeNaive<T>;
|
||||
std::string KernelName = "transposeNaive";
|
||||
if(ttype == TransposeType::TRANSPOSE_NO_BANK_CONFLICTS)
|
||||
{
|
||||
Kernel = transposeLdsNoBankConflicts<T>;
|
||||
KernelName = "transposeLdsNoBankConflicts";
|
||||
}
|
||||
else if(ttype == TransposeType::TRANSPOSE_INPLACE_LDS)
|
||||
{
|
||||
Kernel = transposeLdsSwapInplace<T>;
|
||||
KernelName = "transposeLdsSwapInplace";
|
||||
}
|
||||
|
||||
{
|
||||
std::string functypeid = __PRETTY_FUNCTION__;
|
||||
auto it_beg = functypeid.rfind("[T = ");
|
||||
auto it_end = functypeid.rfind(']');
|
||||
|
||||
if(it_beg != std::string::npos) it_beg += std::string("[T = ").size();
|
||||
|
||||
if(it_beg < it_end && it_end != std::string::npos)
|
||||
KernelName += '<' + functypeid.substr(it_beg, it_end - it_beg) + '>';
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
HIP_API_CALL(hipEventRecord(start, stream));
|
||||
|
||||
for(int i = 0; i < num_iter; i++)
|
||||
{
|
||||
Kernel<<<grid, block, 0, stream>>>(out, in, M);
|
||||
HIP_API_CALL(hipGetLastError());
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipEventRecord(stop, stream));
|
||||
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, databytes, hipMemcpyDefault, stream));
|
||||
HIP_API_CALL(hipEventSynchronize(stop));
|
||||
|
||||
float time;
|
||||
HIP_API_CALL(hipEventElapsedTime(&time, start, stop));
|
||||
float GB = databytes * num_iter * 2 / float(1 << 30);
|
||||
|
||||
{
|
||||
lock_guard_t _lk{print_lock};
|
||||
std::cout << "The average performance of " << std::setw(38) << KernelName << " : "
|
||||
<< (1000 * GB / time) << " GB/s" << std::endl;
|
||||
}
|
||||
|
||||
verify();
|
||||
}
|
||||
|
||||
void verify() const
|
||||
{
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
for(int i = 0; i < 10; i++)
|
||||
{
|
||||
int row = rand() % M;
|
||||
int col = rand() % M;
|
||||
if(inp_matrix[row * M + col] != out_matrix[col * M + row])
|
||||
{
|
||||
lock_guard_t _lk{print_lock};
|
||||
std::cout << "mismatch: " << row << ", " << col << " : "
|
||||
<< inp_matrix[row * M + col] << " | " << out_matrix[col * M + row]
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~Transpose()
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(devID));
|
||||
HIP_API_CALL(hipEventDestroy(start));
|
||||
HIP_API_CALL(hipEventDestroy(stop));
|
||||
|
||||
HIP_API_CALL(hipFree(in));
|
||||
HIP_API_CALL(hipFree(out));
|
||||
HIP_API_CALL(hipStreamDestroy(stream));
|
||||
|
||||
delete[] inp_matrix;
|
||||
delete[] out_matrix;
|
||||
}
|
||||
|
||||
const int devID;
|
||||
const size_t M;
|
||||
const size_t databytes;
|
||||
|
||||
hipStream_t stream;
|
||||
hipEvent_t start, stop;
|
||||
|
||||
T* inp_matrix = nullptr;
|
||||
T* out_matrix = nullptr;
|
||||
|
||||
T* in = nullptr;
|
||||
T* out = nullptr;
|
||||
};
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
int deviceId = 0;
|
||||
int blockDimY = 8;
|
||||
int num_iter = 3;
|
||||
int mat_size = 8192;
|
||||
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
auto _arg = std::string{argv[i]};
|
||||
if(_arg == "?" || _arg == "-h" || _arg == "--help")
|
||||
{
|
||||
std::cout << "usage: transpose "
|
||||
<< "[MatrixSize (" << mat_size << ")] "
|
||||
<< "[numIter (" << num_iter << ")] "
|
||||
<< "[blockDimY (" << blockDimY << ")] "
|
||||
<< "[DEVICE_ID (" << deviceId << ")] " << std::endl;
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
if(argc > 1) mat_size = atoll(argv[1]);
|
||||
if(argc > 2) num_iter = atoll(argv[2]);
|
||||
if(argc > 3) blockDimY = atoll(argv[3]);
|
||||
if(argc > 4) deviceId = atoll(argv[4]);
|
||||
|
||||
printf("[transpose] Matrix size: %d, device ID: %d, num iter: %d, blockDimY: %d\n",
|
||||
mat_size,
|
||||
deviceId,
|
||||
num_iter,
|
||||
blockDimY);
|
||||
|
||||
int ndevice = 0;
|
||||
HIP_API_CALL(hipGetDeviceCount(&ndevice));
|
||||
printf("[transpose] Number of devices found: %i\n", ndevice);
|
||||
assert(ndevice > 0);
|
||||
|
||||
if(deviceId >= ndevice) exit(EXIT_FAILURE);
|
||||
|
||||
{
|
||||
std::vector<std::unique_ptr<ITranspose>> kernels;
|
||||
kernels.push_back(std::make_unique<Transpose<int>>(deviceId, mat_size));
|
||||
kernels.push_back(std::make_unique<Transpose<float>>(deviceId, mat_size));
|
||||
kernels.push_back(std::make_unique<Transpose<double>>(deviceId, mat_size));
|
||||
|
||||
for(auto& kernel : kernels)
|
||||
{
|
||||
kernel->run(TransposeType::TRANSPOSE_NAIVE, blockDimY, num_iter);
|
||||
kernel->run(TransposeType::TRANSPOSE_INPLACE_LDS, blockDimY, num_iter);
|
||||
kernel->run(TransposeType::TRANSPOSE_NO_BANK_CONFLICTS, blockDimY, num_iter);
|
||||
}
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
HIP_API_CALL(hipDeviceReset());
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
#pragma once
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#define HIP_API_CALL(CALL) \
|
||||
{ \
|
||||
hipError_t error_ = (CALL); \
|
||||
if(error_ != hipSuccess) \
|
||||
{ \
|
||||
lock_guard_t _hip_api_print_lk{print_lock}; \
|
||||
fprintf(stderr, \
|
||||
"%s:%d :: HIP error : %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
hipGetErrorString(error_)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define TILE_DIM 64
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
transposeNaive(T* odata, const T* idata, size_t size)
|
||||
{
|
||||
size_t idx = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
size_t block_posy = blockIdx.y * TILE_DIM;
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
odata[size * idx + block_posy + idy] = idata[idx + (block_posy + idy) * size];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
transposeLdsNoBankConflicts(T* odata, const T* idata, size_t size)
|
||||
{
|
||||
__shared__ T tile[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
size_t idy_in = blockIdx.y * TILE_DIM + threadIdx.y;
|
||||
size_t index_in = idx_in + idy_in * size;
|
||||
|
||||
size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
|
||||
size_t idy_out = blockIdx.x * TILE_DIM + threadIdx.y;
|
||||
size_t index_out = idx_out + idy_out * size;
|
||||
|
||||
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
|
||||
tile[threadIdx.y + y][threadIdx.x] = idata[index_in + y * size];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
|
||||
odata[index_out + y * size] = tile[threadIdx.x][threadIdx.y + y];
|
||||
}
|
||||
|
||||
// Generates more interesting ISA
|
||||
template <typename T>
|
||||
__global__ void
|
||||
transposeLdsSwapInplace(T* odata, const T* idata, size_t size)
|
||||
{
|
||||
__shared__ T tile[TILE_DIM][TILE_DIM];
|
||||
|
||||
const size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
tile[idy][threadIdx.x] = idata[idx_in + (idy + blockIdx.y * TILE_DIM) * size];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
if(idy < threadIdx.x)
|
||||
{
|
||||
T temp = tile[idy][threadIdx.x];
|
||||
tile[idy][threadIdx.x] = tile[threadIdx.x][idy];
|
||||
tile[threadIdx.x][idy] = temp;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
|
||||
|
||||
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
|
||||
odata[(blockIdx.x * TILE_DIM + idy) * size + idx_out] = tile[idy][threadIdx.x];
|
||||
}
|
||||
@@ -30,6 +30,7 @@ set(ROCPROFILER_HEADER_FILES
|
||||
profile_config.h
|
||||
registration.h
|
||||
spm.h
|
||||
thread_trace.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/version.h)
|
||||
|
||||
install(
|
||||
|
||||
@@ -80,6 +80,7 @@ ROCPROFILER_EXTERN_C_FINI
|
||||
// #include "rocprofiler-sdk/marker.h"
|
||||
#include "rocprofiler-sdk/pc_sampling.h"
|
||||
#include "rocprofiler-sdk/profile_config.h"
|
||||
#include "rocprofiler-sdk/thread_trace.h"
|
||||
// #include "rocprofiler-sdk/spm.h"
|
||||
|
||||
ROCPROFILER_EXTERN_C_INIT
|
||||
|
||||
@@ -0,0 +1,108 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/defines.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa.h>
|
||||
|
||||
ROCPROFILER_EXTERN_C_INIT
|
||||
|
||||
/**
|
||||
* @defgroup THREAD_TRACE Thread Trace Service
|
||||
* @brief Provides API calls to enable and handle thread trace data
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
typedef void (*rocprofiler_att_data_types_callback_t)(int64_t data_type_id,
|
||||
const char* data_type_name,
|
||||
void* userdata);
|
||||
|
||||
rocprofiler_status_t
|
||||
rocprofiler_att_iterate_data_types(rocprofiler_att_data_types_callback_t callback,
|
||||
void* userdata) ROCPROFILER_API;
|
||||
|
||||
typedef union
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32_t enable_async_queue : 1;
|
||||
uint32_t enable_occupancy_mode : 1;
|
||||
uint32_t enable_double_buffering : 1;
|
||||
uint32_t disable_att_markers : 1;
|
||||
uint32_t disable_software_header : 1;
|
||||
};
|
||||
uint32_t raw;
|
||||
} rocprofiler_att_parameter_flag_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
rocprofiler_att_parameter_flag_t flags;
|
||||
int shader_num;
|
||||
int* shader_ids;
|
||||
uint64_t buffer_size;
|
||||
uint8_t target_cu;
|
||||
uint8_t simd_select;
|
||||
uint8_t reserved;
|
||||
uint8_t vmid_mask;
|
||||
uint16_t perfcounter_mask;
|
||||
uint8_t perfcounter_ctrl;
|
||||
uint8_t perfcounter_num;
|
||||
const char** perfcounter;
|
||||
} rocprofiler_att_parameters_t;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
ROCPROFILER_ATT_CONTROL_NONE = 0,
|
||||
ROCPROFILER_ATT_CONTROL_START = 1,
|
||||
ROCPROFILER_ATT_CONTROL_STOP = 2,
|
||||
ROCPROFILER_ATT_CONTROL_START_AND_STOP = 3
|
||||
} rocprofiler_att_control_flags_t;
|
||||
|
||||
typedef rocprofiler_att_control_flags_t (*rocprofiler_att_dispatch_callback_t)(
|
||||
rocprofiler_queue_id_t queue_id,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t correlation_id,
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet,
|
||||
uint64_t kernel_id,
|
||||
void* userdata);
|
||||
|
||||
typedef void (*rocprofiler_att_shader_data_callback_t)(int64_t shader_engine_id,
|
||||
int64_t data_type_id,
|
||||
const char* data_type_name,
|
||||
void* data,
|
||||
size_t data_size,
|
||||
void* userdata);
|
||||
|
||||
rocprofiler_status_t
|
||||
rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id,
|
||||
rocprofiler_att_parameters_t parameters,
|
||||
rocprofiler_att_dispatch_callback_t dispatch_callback,
|
||||
rocprofiler_att_shader_data_callback_t shader_callback,
|
||||
void* callback_userdata) ROCPROFILER_API;
|
||||
|
||||
/** @} */
|
||||
|
||||
ROCPROFILER_EXTERN_C_FINI
|
||||
@@ -9,6 +9,8 @@ set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "tools")
|
||||
add_subdirectory(rocprofiler-sdk-tool)
|
||||
add_subdirectory(rocprofiler-sdk-roctx)
|
||||
|
||||
add_subdirectory(rocprofiler-sdk-codeobj)
|
||||
|
||||
if(ROCPROFILER_BUILD_TESTS)
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
# ##############################################################################
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. # # Permission is hereby
|
||||
# granted, free of charge, to any person obtaining a copy # of this software and
|
||||
# associated documentation files (the "Software"), to # deal in the Software
|
||||
# without restriction, including without limitation the # rights to use, copy,
|
||||
# modify, merge, publish, distribute, sublicense, and/or # sell copies of the
|
||||
# Software, and to permit persons to whom the Software is # furnished to do so,
|
||||
# subject to the following conditions: # # The above copyright notice and this
|
||||
# permission notice shall be included in # all copies or substantial portions of
|
||||
# the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||
# KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
|
||||
# EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
||||
# DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
||||
# OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
||||
# USE OR OTHER DEALINGS # IN THE SOFTWARE.
|
||||
# ##############################################################################
|
||||
|
||||
set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "codeobj")
|
||||
find_package(amd_comgr REQUIRED)
|
||||
|
||||
set(LIB_CODEOBJ_PARSER_SOURCES code_object_track.cpp code_printing.cpp disassembly.cpp)
|
||||
set(LIB_CODEOBJ_PARSER_HEADERS code_object_track.hpp code_printing.hpp disassembly.hpp
|
||||
segment.hpp)
|
||||
|
||||
add_library(rocprofiler-sdk-codeobj STATIC)
|
||||
target_sources(rocprofiler-sdk-codeobj PRIVATE ${LIB_CODEOBJ_PARSER_SOURCES})
|
||||
target_link_libraries(
|
||||
rocprofiler-sdk-codeobj
|
||||
PRIVATE amd_comgr dw elf rocprofiler::rocprofiler-build-flags
|
||||
rocprofiler::rocprofiler-memcheck rocprofiler::rocprofiler-common-library)
|
||||
|
||||
target_include_directories(rocprofiler-sdk-codeobj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if(ROCPROFILER_BUILD_CODECOV)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -coverage")
|
||||
target_link_libraries(rocprofiler-sdk-codeobj PUBLIC gcov)
|
||||
endif()
|
||||
|
||||
set_target_properties(
|
||||
rocprofiler-sdk-codeobj
|
||||
PROPERTIES LIBRARY_OUTPUT_DIRECTORY
|
||||
${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/rocprofiler-sdk-codeobj
|
||||
SOVERSION ${PROJECT_VERSION_MAJOR}
|
||||
VERSION ${PROJECT_VERSION}
|
||||
BUILD_RPATH "\$ORIGIN:\$ORIGIN/.."
|
||||
INSTALL_RPATH "\$ORIGIN:\$ORIGIN/..")
|
||||
|
||||
install(
|
||||
TARGETS rocprofiler-sdk-codeobj
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/rocprofiler-sdk-codeobj
|
||||
COMPONENT tools
|
||||
EXPORT rocprofiler-sdk-codeobj-targets)
|
||||
|
||||
install(
|
||||
FILES ${LIB_CODEOBJ_PARSER_HEADERS}
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk-codeobj
|
||||
COMPONENT tools)
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 AMD ROCm™ Software
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1 @@
|
||||
# rocprofiler-codeobj-parser
|
||||
@@ -0,0 +1,196 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <sys/mman.h>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "code_object_track.hpp"
|
||||
|
||||
void
|
||||
CodeobjRecorder::Load(uint64_t addr,
|
||||
uint64_t load_size,
|
||||
const std::string& URI,
|
||||
uint64_t mem_addr,
|
||||
uint64_t mem_size,
|
||||
uint64_t id)
|
||||
{
|
||||
Load(std::make_shared<CodeobjCaptureInstance>(
|
||||
addr, load_size, URI, mem_addr, mem_size, id, capture_mode));
|
||||
}
|
||||
|
||||
void
|
||||
CodeobjCaptureInstance::copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path)
|
||||
{
|
||||
std::ifstream file(path, std::ios::in | std::ios::binary);
|
||||
if(!file)
|
||||
{
|
||||
printf("could not open `%s'\n", path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
if(!size)
|
||||
{
|
||||
file.ignore(std::numeric_limits<std::streamsize>::max());
|
||||
size_t bytes = file.gcount();
|
||||
file.clear();
|
||||
|
||||
if(bytes < offset)
|
||||
{
|
||||
printf("invalid uri `%s' (file size < offset)\n", path.c_str());
|
||||
return;
|
||||
}
|
||||
size = bytes - offset;
|
||||
}
|
||||
|
||||
file.seekg(offset, std::ios_base::beg);
|
||||
buffer.resize(size);
|
||||
file.read(&buffer[0], size);
|
||||
}
|
||||
|
||||
void CodeobjCaptureInstance::copyCodeobjFromMemory(uint64_t, uint64_t)
|
||||
{
|
||||
// buffer.resize(mem_size);
|
||||
// std::memcpy(buffer.data(), (uint64_t*)mem_addr, mem_size);
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t>
|
||||
CodeobjCaptureInstance::parse_uri()
|
||||
{
|
||||
const std::string protocol_delim{"://"};
|
||||
|
||||
size_t protocol_end = URI.find(protocol_delim);
|
||||
protocol = URI.substr(0, protocol_end);
|
||||
protocol_end += protocol_delim.length();
|
||||
|
||||
std::transform(protocol.begin(), protocol.end(), protocol.begin(), [](unsigned char c) {
|
||||
return std::tolower(c);
|
||||
});
|
||||
|
||||
std::string path;
|
||||
size_t path_end = URI.find_first_of("#?", protocol_end);
|
||||
if(path_end != std::string::npos)
|
||||
{
|
||||
path = URI.substr(protocol_end, path_end++ - protocol_end);
|
||||
}
|
||||
else
|
||||
{
|
||||
path = URI.substr(protocol_end);
|
||||
}
|
||||
|
||||
/* %-decode the string. */
|
||||
decoded_path = std::string{};
|
||||
decoded_path.reserve(path.length());
|
||||
for(size_t i = 0; i < path.length(); ++i)
|
||||
{
|
||||
if(path[i] == '%' && std::isxdigit(path[i + 1]) && std::isxdigit(path[i + 2]))
|
||||
{
|
||||
decoded_path += std::stoi(path.substr(i + 1, 2), 0, 16);
|
||||
i += 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
decoded_path += path[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* Tokenize the query/fragment. */
|
||||
std::vector<std::string> tokens;
|
||||
size_t pos, last = path_end;
|
||||
while((pos = URI.find('&', last)) != std::string::npos)
|
||||
{
|
||||
tokens.emplace_back(URI.substr(last, pos - last));
|
||||
last = pos + 1;
|
||||
}
|
||||
if(last != std::string::npos) tokens.emplace_back(URI.substr(last));
|
||||
|
||||
/* Create a tag-value map from the tokenized query/fragment. */
|
||||
std::unordered_map<std::string, std::string> params;
|
||||
std::for_each(tokens.begin(), tokens.end(), [&](std::string& token) {
|
||||
size_t delim = token.find('=');
|
||||
if(delim != std::string::npos)
|
||||
{
|
||||
params.emplace(token.substr(0, delim), token.substr(delim + 1));
|
||||
}
|
||||
});
|
||||
|
||||
size_t offset = 0;
|
||||
size_t size = 0;
|
||||
|
||||
if(auto offset_it = params.find("offset"); offset_it != params.end())
|
||||
offset = std::stoul(offset_it->second, nullptr, 0);
|
||||
|
||||
if(auto size_it = params.find("size"); size_it != params.end())
|
||||
{
|
||||
if(!(size = std::stoul(size_it->second, nullptr, 0))) throw std::exception();
|
||||
}
|
||||
|
||||
return {offset, size};
|
||||
}
|
||||
|
||||
void
|
||||
CodeobjCaptureInstance::reset(codeobj_capture_mode_t mode)
|
||||
{
|
||||
if(static_cast<int>(mode) <= static_cast<int>(capture_mode)) return;
|
||||
|
||||
capture_mode = mode;
|
||||
if(!buffer.empty()) return;
|
||||
|
||||
size_t offset, size;
|
||||
try
|
||||
{
|
||||
std::tie(offset, size) = parse_uri();
|
||||
} catch(...)
|
||||
{
|
||||
std::cerr << "Error parsing URI " << URI << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
if(protocol == "file")
|
||||
{
|
||||
if(mode == ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY)
|
||||
copyCodeobjFromFile(offset, size, decoded_path);
|
||||
}
|
||||
else if(protocol == "memory")
|
||||
{
|
||||
if(mode != ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY)
|
||||
copyCodeobjFromMemory(mem_addr, mem_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("\"%s\" protocol not supported\n", protocol.c_str());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,158 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <shared_mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* Enum defines how code object is captured for ATT and PC Sampling
|
||||
*/
|
||||
enum codeobj_capture_mode_t
|
||||
{
|
||||
/**
|
||||
* Capture file and memory paths for the loaded code object
|
||||
*/
|
||||
ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY = 0,
|
||||
/**
|
||||
* Capture symbols for file:// and memory:// type objects,
|
||||
* and generate a copy of all kernel code for objects under memory://
|
||||
*/
|
||||
ROCPROFILER_CODEOBJ_CAPTURE_COPY_MEMORY = 1,
|
||||
/**
|
||||
* Capture symbols and all kernel code for file:// and memory:// type objects
|
||||
*/
|
||||
ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY = 2,
|
||||
ROCPROFILER_CODEOBJ_CAPTURE_LAST = 3,
|
||||
};
|
||||
|
||||
/**
|
||||
* A class to keep track of currently loaded code objects.
|
||||
* Only the public static methods are thread-safe and expected to be used.
|
||||
*/
|
||||
class CodeobjCaptureInstance
|
||||
{
|
||||
public:
|
||||
CodeobjCaptureInstance(uint64_t _addr,
|
||||
uint64_t _load_size,
|
||||
const std::string& _uri,
|
||||
uint64_t _mem_addr,
|
||||
uint64_t _mem_size,
|
||||
uint64_t id,
|
||||
codeobj_capture_mode_t mode)
|
||||
: addr(_addr)
|
||||
, load_size(_load_size)
|
||||
, load_id(id)
|
||||
, URI(_uri)
|
||||
, mem_addr(_mem_addr)
|
||||
, mem_size(_mem_size)
|
||||
{
|
||||
reset(mode);
|
||||
};
|
||||
|
||||
const uint64_t addr;
|
||||
const uint64_t load_size;
|
||||
const uint64_t load_id;
|
||||
|
||||
private:
|
||||
void reset(codeobj_capture_mode_t mode);
|
||||
|
||||
std::pair<size_t, size_t> parse_uri();
|
||||
void DecodePath();
|
||||
void copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path);
|
||||
void copyCodeobjFromMemory(uint64_t, uint64_t);
|
||||
|
||||
std::string URI{};
|
||||
std::string decoded_path{};
|
||||
std::string protocol{};
|
||||
std::vector<char> buffer{};
|
||||
|
||||
uint64_t mem_addr = 0;
|
||||
uint64_t mem_size = 0;
|
||||
codeobj_capture_mode_t capture_mode = ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY;
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<CodeobjCaptureInstance> CodeobjPtr;
|
||||
|
||||
template <>
|
||||
struct std::hash<CodeobjPtr>
|
||||
{
|
||||
uint64_t operator()(const CodeobjPtr& p) const { return p->load_id; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct std::equal_to<CodeobjPtr>
|
||||
{
|
||||
bool operator()(const CodeobjPtr& a, const CodeobjPtr& b) const
|
||||
{
|
||||
return (a->addr == b->addr) && (a->load_id == b->load_id);
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* A class to keep track of the history of loaded code objets.
|
||||
* Only the public static methods are thread-safe and expected to be used.
|
||||
*/
|
||||
class CodeobjRecorder
|
||||
{
|
||||
public:
|
||||
CodeobjRecorder(codeobj_capture_mode_t mode)
|
||||
: capture_mode(mode){};
|
||||
|
||||
void Load(uint64_t _addr,
|
||||
uint64_t _load_size,
|
||||
const std::string& _uri,
|
||||
uint64_t mem_addr,
|
||||
uint64_t mem_size,
|
||||
uint64_t id);
|
||||
void Load(CodeobjPtr capture)
|
||||
{
|
||||
std::lock_guard<std::shared_mutex> lk(mutex);
|
||||
captures[capture->load_id] = capture;
|
||||
}
|
||||
void Unload(uint64_t id)
|
||||
{
|
||||
std::lock_guard<std::shared_mutex> lk(mutex);
|
||||
captures.erase(id);
|
||||
};
|
||||
|
||||
public:
|
||||
std::shared_mutex mutex;
|
||||
|
||||
std::vector<CodeobjPtr> get()
|
||||
{
|
||||
std::vector<CodeobjPtr> vec;
|
||||
std::shared_lock<std::shared_mutex> lk(mutex);
|
||||
for(auto& [k, v] : captures)
|
||||
vec.push_back(v);
|
||||
return vec;
|
||||
};
|
||||
|
||||
private:
|
||||
codeobj_capture_mode_t capture_mode;
|
||||
std::unordered_map<uint64_t, CodeobjPtr> captures;
|
||||
};
|
||||
@@ -0,0 +1,278 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cxxabi.h>
|
||||
#include <elfutils/libdw.h>
|
||||
#include <hsa/amd_hsa_elf.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include <atomic>
|
||||
|
||||
#define C_API_BEGIN \
|
||||
try \
|
||||
{
|
||||
#define C_API_END(returndata) \
|
||||
} \
|
||||
catch(std::exception & e) \
|
||||
{ \
|
||||
std::string s = e.what(); \
|
||||
if(s.find("memory protocol not supported!") == std::string::npos) \
|
||||
std::cerr << "Codeobj API lookup: " << e.what() << std::endl; \
|
||||
return returndata; \
|
||||
} \
|
||||
catch(std::string & s) \
|
||||
{ \
|
||||
if(s.find("memory protocol not supported!") == std::string::npos) \
|
||||
std::cerr << "Codeobj API lookup: " << s << std::endl; \
|
||||
return returndata; \
|
||||
} \
|
||||
catch(...) { return returndata; }
|
||||
|
||||
CodeobjDecoderComponent::CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size)
|
||||
{
|
||||
m_fd = -1;
|
||||
#if defined(_GNU_SOURCE) && defined(MFD_ALLOW_SEALING) && defined(MFD_CLOEXEC)
|
||||
m_fd = ::memfd_create(m_uri.c_str(), MFD_ALLOW_SEALING | MFD_CLOEXEC);
|
||||
#endif
|
||||
if(m_fd == -1) // If fail, attempt under /tmp
|
||||
m_fd = ::open("/tmp", O_TMPFILE | O_RDWR, 0666);
|
||||
|
||||
if(m_fd == -1)
|
||||
{
|
||||
printf("could not create a temporary file for code object\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if(size_t size = ::write(m_fd, codeobj_data, codeobj_size); size != codeobj_size)
|
||||
{
|
||||
printf("could not write to the temporary file\n");
|
||||
return;
|
||||
}
|
||||
::lseek(m_fd, 0, SEEK_SET);
|
||||
fsync(m_fd);
|
||||
|
||||
m_line_number_map = {};
|
||||
|
||||
std::unique_ptr<Dwarf, void (*)(Dwarf*)> dbg(dwarf_begin(m_fd, DWARF_C_READ),
|
||||
[](Dwarf* _dbg) { dwarf_end(_dbg); });
|
||||
|
||||
/*if (!dbg) {
|
||||
rocprofiler::warning("Error opening Dwarf!\n");
|
||||
return;
|
||||
} */
|
||||
|
||||
if(dbg)
|
||||
{
|
||||
Dwarf_Off cu_offset{0}, next_offset;
|
||||
size_t header_size;
|
||||
|
||||
std::unordered_set<uint64_t> used_addrs;
|
||||
|
||||
while(!dwarf_nextcu(
|
||||
dbg.get(), cu_offset, &next_offset, &header_size, nullptr, nullptr, nullptr))
|
||||
{
|
||||
Dwarf_Die die;
|
||||
if(!dwarf_offdie(dbg.get(), cu_offset + header_size, &die)) continue;
|
||||
|
||||
Dwarf_Lines* lines;
|
||||
size_t line_count;
|
||||
if(dwarf_getsrclines(&die, &lines, &line_count)) continue;
|
||||
|
||||
for(size_t i = 0; i < line_count; ++i)
|
||||
{
|
||||
Dwarf_Addr addr;
|
||||
int line_number;
|
||||
Dwarf_Line* line = dwarf_onesrcline(lines, i);
|
||||
|
||||
if(line && !dwarf_lineaddr(line, &addr) && !dwarf_lineno(line, &line_number) &&
|
||||
line_number)
|
||||
{
|
||||
std::string src = dwarf_linesrc(line, nullptr, nullptr);
|
||||
auto dwarf_line = src + ':' + std::to_string(line_number);
|
||||
|
||||
if(used_addrs.find(addr) != used_addrs.end())
|
||||
{
|
||||
size_t pos = m_line_number_map.lower_bound(addr);
|
||||
m_line_number_map.data()[pos].str += ' ' + dwarf_line;
|
||||
continue;
|
||||
}
|
||||
|
||||
used_addrs.insert(addr);
|
||||
m_line_number_map.insert(DSourceLine{addr, 0, std::move(dwarf_line)});
|
||||
}
|
||||
}
|
||||
cu_offset = next_offset;
|
||||
}
|
||||
}
|
||||
|
||||
// Can throw
|
||||
disassembly = std::make_unique<DisassemblyInstance>(codeobj_data, codeobj_size);
|
||||
if(m_line_number_map.size())
|
||||
{
|
||||
size_t total_size = 0;
|
||||
for(size_t i = 0; i < m_line_number_map.size() - 1; i++)
|
||||
{
|
||||
size_t s = m_line_number_map.get(i + 1).vaddr - m_line_number_map.get(i).vaddr;
|
||||
m_line_number_map.data()[i].size = s;
|
||||
total_size += s;
|
||||
}
|
||||
m_line_number_map.back().size = std::max(total_size, codeobj_size) - total_size;
|
||||
}
|
||||
try
|
||||
{
|
||||
m_symbol_map = disassembly->GetKernelMap(); // Can throw
|
||||
} catch(...)
|
||||
{}
|
||||
|
||||
// disassemble_kernels();
|
||||
}
|
||||
|
||||
CodeobjDecoderComponent::~CodeobjDecoderComponent()
|
||||
{
|
||||
if(m_fd) ::close(m_fd);
|
||||
}
|
||||
|
||||
std::shared_ptr<Instruction>
|
||||
CodeobjDecoderComponent::disassemble_instruction(uint64_t faddr, uint64_t vaddr)
|
||||
{
|
||||
if(!disassembly) throw std::exception();
|
||||
|
||||
const char* cpp_line = nullptr;
|
||||
|
||||
try
|
||||
{
|
||||
const DSourceLine& it = m_line_number_map.find_obj(vaddr);
|
||||
cpp_line = it.str.data();
|
||||
} catch(...)
|
||||
{}
|
||||
|
||||
auto pair = disassembly->ReadInstruction(faddr);
|
||||
auto inst = std::make_shared<Instruction>(std::move(pair.first), pair.second);
|
||||
inst->faddr = faddr;
|
||||
inst->vaddr = vaddr;
|
||||
|
||||
if(cpp_line) inst->comment = cpp_line;
|
||||
return inst;
|
||||
}
|
||||
|
||||
LoadedCodeobjDecoder::LoadedCodeobjDecoder(const char* filepath,
|
||||
uint64_t _load_addr,
|
||||
uint64_t mem_size)
|
||||
: load_addr(_load_addr)
|
||||
, load_end(load_addr + mem_size)
|
||||
{
|
||||
if(!filepath) throw "Empty filepath.";
|
||||
|
||||
std::string_view fpath(filepath);
|
||||
|
||||
if(fpath.rfind(".out") + 4 == fpath.size())
|
||||
{
|
||||
std::ifstream file(filepath, std::ios::in | std::ios::binary);
|
||||
|
||||
if(!file.is_open()) throw "Invalid filename " + std::string(filepath);
|
||||
|
||||
std::vector<char> buffer;
|
||||
file.seekg(0, file.end);
|
||||
buffer.resize(file.tellg());
|
||||
file.seekg(0, file.beg);
|
||||
file.read(buffer.data(), buffer.size());
|
||||
|
||||
decoder = std::make_unique<CodeobjDecoderComponent>(buffer.data(), buffer.size());
|
||||
}
|
||||
else
|
||||
{
|
||||
std::unique_ptr<CodeObjectBinary> binary = std::make_unique<CodeObjectBinary>(filepath);
|
||||
auto& buffer = binary->buffer;
|
||||
decoder = std::make_unique<CodeobjDecoderComponent>(buffer.data(), buffer.size());
|
||||
}
|
||||
|
||||
elf_segments = decoder->disassembly->getSegments();
|
||||
}
|
||||
|
||||
LoadedCodeobjDecoder::LoadedCodeobjDecoder(const void* data,
|
||||
size_t size,
|
||||
uint64_t _load_addr,
|
||||
uint64_t mem_size)
|
||||
: load_addr(_load_addr)
|
||||
, load_end(load_addr + mem_size)
|
||||
{
|
||||
decoder = std::make_unique<CodeobjDecoderComponent>(reinterpret_cast<const char*>(data), size);
|
||||
elf_segments = decoder->disassembly->getSegments();
|
||||
}
|
||||
|
||||
std::shared_ptr<Instruction>
|
||||
LoadedCodeobjDecoder::add_to_map(uint64_t ld_addr)
|
||||
{
|
||||
if(!decoder || ld_addr < load_addr) throw std::out_of_range("Addr not in decoder");
|
||||
|
||||
uint64_t voffset = ld_addr - load_addr;
|
||||
auto faddr = decoder->disassembly->va2fo(voffset);
|
||||
if(!faddr) throw std::out_of_range("Could not find file offset");
|
||||
|
||||
auto shared = decoder->disassemble_instruction(*faddr, voffset);
|
||||
shared->ld_addr = ld_addr;
|
||||
decoded_map[ld_addr] = shared;
|
||||
return shared;
|
||||
}
|
||||
|
||||
std::shared_ptr<Instruction>
|
||||
LoadedCodeobjDecoder::get(uint64_t addr)
|
||||
{
|
||||
if(decoded_map.find(addr) != decoded_map.end()) return decoded_map[addr];
|
||||
|
||||
try
|
||||
{
|
||||
return add_to_map(addr);
|
||||
} catch(std::exception& e)
|
||||
{
|
||||
std::cerr << e.what() << " at addr " << std::hex << addr << std::dec << std::endl;
|
||||
}
|
||||
throw std::out_of_range("Invalid address");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#define PUBLIC_API __attribute__((visibility("default")))
|
||||
@@ -0,0 +1,257 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "disassembly.hpp"
|
||||
#include "segment.hpp"
|
||||
|
||||
struct Instruction
|
||||
{
|
||||
Instruction() = default;
|
||||
Instruction(std::string&& _inst, size_t _size)
|
||||
: inst(std::move(_inst))
|
||||
, size(_size)
|
||||
{}
|
||||
std::string inst;
|
||||
std::string comment;
|
||||
uint64_t faddr;
|
||||
uint64_t vaddr;
|
||||
uint64_t ld_addr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct DSourceLine
|
||||
{
|
||||
uint64_t vaddr;
|
||||
uint64_t size;
|
||||
std::string str;
|
||||
uint64_t begin() const { return vaddr; }
|
||||
bool inrange(uint64_t addr) const { return addr >= vaddr && addr < vaddr + size; }
|
||||
};
|
||||
|
||||
class CodeobjDecoderComponent
|
||||
{
|
||||
public:
|
||||
CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size);
|
||||
~CodeobjDecoderComponent();
|
||||
|
||||
std::shared_ptr<Instruction> disassemble_instruction(uint64_t faddr, uint64_t vaddr);
|
||||
int m_fd;
|
||||
|
||||
cached_ordered_vector<DSourceLine> m_line_number_map;
|
||||
std::map<uint64_t, SymbolInfo> m_symbol_map{};
|
||||
|
||||
std::string m_uri;
|
||||
std::vector<std::shared_ptr<Instruction>> instructions{};
|
||||
std::unique_ptr<DisassemblyInstance> disassembly{};
|
||||
};
|
||||
|
||||
class LoadedCodeobjDecoder
|
||||
{
|
||||
public:
|
||||
LoadedCodeobjDecoder(const char* filepath, uint64_t load_addr, uint64_t memsize);
|
||||
LoadedCodeobjDecoder(const void* data, uint64_t size, uint64_t load_addr, size_t memsize);
|
||||
std::shared_ptr<Instruction> add_to_map(uint64_t ld_addr);
|
||||
|
||||
std::shared_ptr<Instruction> get(uint64_t addr);
|
||||
uint64_t begin() const { return load_addr; };
|
||||
uint64_t end() const { return load_end; }
|
||||
uint64_t size() const { return load_end - load_addr; }
|
||||
bool inrange(uint64_t addr) const { return addr >= begin() && addr < end(); }
|
||||
|
||||
const char* getSymbolName(uint64_t addr) const
|
||||
{
|
||||
if(!decoder) return nullptr;
|
||||
|
||||
auto it = decoder->m_symbol_map.find(addr - load_addr);
|
||||
if(it != decoder->m_symbol_map.end()) return it->second.name.data();
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::map<uint64_t, SymbolInfo>& getSymbolMap() const
|
||||
{
|
||||
if(!decoder) throw std::exception();
|
||||
return decoder->m_symbol_map;
|
||||
}
|
||||
std::vector<std::pair<uint64_t, uint64_t>> elf_segments{};
|
||||
const uint64_t load_addr;
|
||||
|
||||
private:
|
||||
uint64_t load_end = 0;
|
||||
|
||||
std::unordered_map<uint64_t, std::shared_ptr<Instruction>> decoded_map;
|
||||
std::unique_ptr<CodeobjDecoderComponent> decoder{nullptr};
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Maps ID and offsets into instructions
|
||||
*/
|
||||
class CodeobjMap
|
||||
{
|
||||
public:
|
||||
CodeobjMap() = default;
|
||||
|
||||
virtual void addDecoder(const char* filepath,
|
||||
codeobj_marker_id_t id,
|
||||
uint64_t load_addr,
|
||||
uint64_t memsize)
|
||||
{
|
||||
decoders[id] = std::make_shared<LoadedCodeobjDecoder>(filepath, load_addr, memsize);
|
||||
}
|
||||
|
||||
virtual void addDecoder(const void* data,
|
||||
size_t memory_size,
|
||||
codeobj_marker_id_t id,
|
||||
uint64_t load_addr,
|
||||
uint64_t memsize)
|
||||
{
|
||||
decoders[id] =
|
||||
std::make_shared<LoadedCodeobjDecoder>(data, memory_size, load_addr, memsize);
|
||||
}
|
||||
|
||||
virtual bool removeDecoderbyId(codeobj_marker_id_t id) { return decoders.erase(id) != 0; }
|
||||
|
||||
std::shared_ptr<Instruction> get(codeobj_marker_id_t id, uint64_t offset)
|
||||
{
|
||||
auto& decoder = decoders.at(id);
|
||||
return decoder->get(decoder->begin() + offset);
|
||||
}
|
||||
|
||||
const char* getSymbolName(codeobj_marker_id_t id, uint64_t offset)
|
||||
{
|
||||
auto& decoder = decoders.at(id);
|
||||
uint64_t vaddr = decoder->begin() + offset;
|
||||
if(decoder->inrange(vaddr)) return decoder->getSymbolName(vaddr);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
protected:
|
||||
std::unordered_map<codeobj_marker_id_t, std::shared_ptr<LoadedCodeobjDecoder>> decoders{};
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Translates virtual addresses to elf file offsets
|
||||
*/
|
||||
class CodeobjAddressTranslate : protected CodeobjMap
|
||||
{
|
||||
using Super = CodeobjMap;
|
||||
|
||||
public:
|
||||
CodeobjAddressTranslate() = default;
|
||||
|
||||
virtual void addDecoder(const char* filepath,
|
||||
codeobj_marker_id_t id,
|
||||
uint64_t load_addr,
|
||||
uint64_t memsize) override
|
||||
{
|
||||
this->Super::addDecoder(filepath, id, load_addr, memsize);
|
||||
auto ptr = decoders.at(id);
|
||||
table.insert({ptr->begin(), ptr->size(), id, 0});
|
||||
}
|
||||
|
||||
virtual void addDecoder(const void* data,
|
||||
size_t memory_size,
|
||||
codeobj_marker_id_t id,
|
||||
uint64_t load_addr,
|
||||
uint64_t memsize) override
|
||||
{
|
||||
this->Super::addDecoder(data, memory_size, id, load_addr, memsize);
|
||||
auto ptr = decoders.at(id);
|
||||
table.insert({ptr->begin(), ptr->size(), id, 0});
|
||||
}
|
||||
|
||||
virtual bool removeDecoder(codeobj_marker_id_t id, uint64_t load_addr)
|
||||
{
|
||||
return table.remove(load_addr) && this->Super::removeDecoderbyId(id);
|
||||
}
|
||||
|
||||
std::shared_ptr<Instruction> get(uint64_t vaddr)
|
||||
{
|
||||
auto& addr_range = table.find_codeobj_in_range(vaddr);
|
||||
return this->Super::get(addr_range.id, vaddr - addr_range.vbegin);
|
||||
}
|
||||
|
||||
std::shared_ptr<Instruction> get(codeobj_marker_id_t id, uint64_t offset)
|
||||
{
|
||||
if(id == 0)
|
||||
return get(offset);
|
||||
else
|
||||
return this->Super::get(id, offset);
|
||||
}
|
||||
|
||||
const char* getSymbolName(uint64_t vaddr)
|
||||
{
|
||||
for(auto& [_, decoder] : decoders)
|
||||
{
|
||||
if(!decoder->inrange(vaddr)) continue;
|
||||
return decoder->getSymbolName(vaddr);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void getSymbolMap(const std::shared_ptr<LoadedCodeobjDecoder>& dec,
|
||||
std::unordered_map<uint64_t, SymbolInfo>& symbols) const
|
||||
{
|
||||
try
|
||||
{
|
||||
auto& smap = dec->getSymbolMap();
|
||||
for(auto& [vaddr, sym] : smap)
|
||||
symbols[vaddr + dec->load_addr] = sym;
|
||||
} catch(std::exception& e)
|
||||
{
|
||||
return;
|
||||
};
|
||||
}
|
||||
|
||||
std::unordered_map<uint64_t, SymbolInfo> getSymbolMap() const
|
||||
{
|
||||
std::unordered_map<uint64_t, SymbolInfo> symbols;
|
||||
|
||||
for(auto& [_, dec] : decoders)
|
||||
this->getSymbolMap(dec, symbols);
|
||||
|
||||
return symbols;
|
||||
}
|
||||
|
||||
std::unordered_map<uint64_t, SymbolInfo> getSymbolMap(codeobj_marker_id_t id) const
|
||||
{
|
||||
std::unordered_map<uint64_t, SymbolInfo> symbols;
|
||||
|
||||
auto it = decoders.find(id);
|
||||
if(it == decoders.end()) return symbols;
|
||||
|
||||
this->getSymbolMap(it->second, symbols);
|
||||
return symbols;
|
||||
}
|
||||
|
||||
private:
|
||||
CodeobjTableTranslator table;
|
||||
};
|
||||
@@ -0,0 +1,372 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#if !defined(_GNU_SOURCE) || !defined(_XOPEN_SOURCE)
|
||||
# define _XOPEN_SOURCE 700
|
||||
#endif
|
||||
|
||||
#include <cxxabi.h>
|
||||
#include <elf.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <elfutils/libdw.h>
|
||||
#include <hsa/amd_hsa_elf.h>
|
||||
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
|
||||
|
||||
#define THROW_COMGR(call) \
|
||||
if(amd_comgr_status_s status = call) \
|
||||
{ \
|
||||
const char* reason = ""; \
|
||||
amd_comgr_status_string(status, &reason); \
|
||||
std::cerr << __FILE__ << ':' << __LINE__ << " code: " << status << " failed: " << reason \
|
||||
<< std::endl; \
|
||||
throw std::exception(); \
|
||||
}
|
||||
|
||||
#define RETURN_COMGR(call) \
|
||||
if(amd_comgr_status_s status = call) \
|
||||
{ \
|
||||
const char* reason = ""; \
|
||||
amd_comgr_status_string(status, &reason); \
|
||||
std::cerr << __FILE__ << ':' << __LINE__ << " code: " << status << " failed: " << reason \
|
||||
<< std::endl; \
|
||||
return AMD_COMGR_STATUS_ERROR; \
|
||||
}
|
||||
|
||||
CodeObjectBinary::CodeObjectBinary(const std::string& uri)
|
||||
: m_uri(uri)
|
||||
{
|
||||
const std::string protocol_delim{"://"};
|
||||
|
||||
size_t protocol_end = m_uri.find(protocol_delim);
|
||||
std::string protocol = m_uri.substr(0, protocol_end);
|
||||
protocol_end += protocol_delim.length();
|
||||
|
||||
std::transform(protocol.begin(), protocol.end(), protocol.begin(), [](unsigned char c) {
|
||||
return std::tolower(c);
|
||||
});
|
||||
|
||||
std::string path;
|
||||
size_t path_end = m_uri.find_first_of("#?", protocol_end);
|
||||
if(path_end != std::string::npos)
|
||||
{
|
||||
path = m_uri.substr(protocol_end, path_end++ - protocol_end);
|
||||
}
|
||||
else
|
||||
{
|
||||
path = m_uri.substr(protocol_end);
|
||||
}
|
||||
|
||||
/* %-decode the string. */
|
||||
std::string decoded_path;
|
||||
decoded_path.reserve(path.length());
|
||||
for(size_t i = 0; i < path.length(); ++i)
|
||||
{
|
||||
if(path[i] == '%' && std::isxdigit(path[i + 1]) && std::isxdigit(path[i + 2]))
|
||||
{
|
||||
decoded_path += std::stoi(path.substr(i + 1, 2), 0, 16);
|
||||
i += 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
decoded_path += path[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* Tokenize the query/fragment. */
|
||||
std::vector<std::string> tokens;
|
||||
size_t pos, last = path_end;
|
||||
while((pos = m_uri.find('&', last)) != std::string::npos)
|
||||
{
|
||||
tokens.emplace_back(m_uri.substr(last, pos - last));
|
||||
last = pos + 1;
|
||||
}
|
||||
if(last != std::string::npos)
|
||||
{
|
||||
tokens.emplace_back(m_uri.substr(last));
|
||||
}
|
||||
|
||||
/* Create a tag-value map from the tokenized query/fragment. */
|
||||
std::unordered_map<std::string, std::string> params;
|
||||
std::for_each(tokens.begin(), tokens.end(), [&](std::string& token) {
|
||||
size_t delim = token.find('=');
|
||||
if(delim != std::string::npos)
|
||||
{
|
||||
params.emplace(token.substr(0, delim), token.substr(delim + 1));
|
||||
}
|
||||
});
|
||||
|
||||
buffer = std::vector<char>{};
|
||||
size_t offset = 0;
|
||||
size_t size = 0;
|
||||
|
||||
if(auto offset_it = params.find("offset"); offset_it != params.end())
|
||||
{
|
||||
offset = std::stoul(offset_it->second, nullptr, 0);
|
||||
}
|
||||
|
||||
if(auto size_it = params.find("size"); size_it != params.end())
|
||||
{
|
||||
if(!(size = std::stoul(size_it->second, nullptr, 0))) return;
|
||||
}
|
||||
|
||||
if(protocol != "file") throw protocol + " protocol not supported!";
|
||||
|
||||
std::ifstream file(decoded_path, std::ios::in | std::ios::binary);
|
||||
if(!file || !file.is_open()) throw "could not open " + decoded_path;
|
||||
|
||||
if(!size)
|
||||
{
|
||||
file.ignore(std::numeric_limits<std::streamsize>::max());
|
||||
size_t bytes = file.gcount();
|
||||
file.clear();
|
||||
|
||||
if(bytes < offset) throw "invalid uri " + decoded_path + " (file size < offset)";
|
||||
|
||||
size = bytes - offset;
|
||||
}
|
||||
|
||||
file.seekg(offset, std::ios_base::beg);
|
||||
buffer.resize(size);
|
||||
file.read(&buffer[0], size);
|
||||
}
|
||||
|
||||
DisassemblyInstance::DisassemblyInstance(const char* codeobj_data, uint64_t codeobj_size)
|
||||
{
|
||||
buffer = std::vector<char>(codeobj_size, 0);
|
||||
std::memcpy(buffer.data(), codeobj_data, codeobj_size);
|
||||
|
||||
THROW_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &data));
|
||||
THROW_COMGR(amd_comgr_set_data(data, buffer.size(), buffer.data()));
|
||||
|
||||
size_t isa_size = 128;
|
||||
std::string input_isa{};
|
||||
input_isa.resize(isa_size);
|
||||
THROW_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, input_isa.data()));
|
||||
|
||||
THROW_COMGR(amd_comgr_create_disassembly_info(
|
||||
input_isa.data(),
|
||||
&DisassemblyInstance::memory_callback,
|
||||
&DisassemblyInstance::inst_callback,
|
||||
[](uint64_t, void*) {},
|
||||
&info));
|
||||
}
|
||||
|
||||
amd_comgr_status_t
|
||||
DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbol, void* user_data)
|
||||
{
|
||||
amd_comgr_symbol_type_t type;
|
||||
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type));
|
||||
|
||||
if(type != AMD_COMGR_SYMBOL_TYPE_FUNC) return AMD_COMGR_STATUS_SUCCESS;
|
||||
|
||||
uint64_t vaddr = 0;
|
||||
uint64_t mem_size = 0;
|
||||
uint64_t name_size = 0;
|
||||
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &vaddr));
|
||||
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size));
|
||||
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size));
|
||||
|
||||
std::string name;
|
||||
name.resize(name_size);
|
||||
|
||||
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data()));
|
||||
|
||||
DisassemblyInstance& instance = *static_cast<DisassemblyInstance*>(user_data);
|
||||
std::optional<uint64_t> faddr = instance.va2fo(vaddr);
|
||||
|
||||
if(faddr) instance.symbol_map[vaddr] = {name, *faddr, vaddr, mem_size};
|
||||
return AMD_COMGR_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
std::map<uint64_t, SymbolInfo>&
|
||||
DisassemblyInstance::GetKernelMap()
|
||||
{
|
||||
symbol_map = {};
|
||||
THROW_COMGR(amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this));
|
||||
|
||||
return symbol_map;
|
||||
}
|
||||
|
||||
DisassemblyInstance::~DisassemblyInstance()
|
||||
{
|
||||
amd_comgr_release_data(data);
|
||||
amd_comgr_destroy_disassembly_info(info);
|
||||
}
|
||||
|
||||
std::pair<std::string, size_t>
|
||||
DisassemblyInstance::ReadInstruction(uint64_t faddr)
|
||||
{
|
||||
uint64_t size_read;
|
||||
uint64_t addr_in_buffer = reinterpret_cast<uint64_t>(buffer.data()) + faddr;
|
||||
|
||||
THROW_COMGR(amd_comgr_disassemble_instruction(info, addr_in_buffer, (void*) this, &size_read));
|
||||
return {std::move(this->last_instruction), size_read};
|
||||
}
|
||||
|
||||
uint64_t
|
||||
DisassemblyInstance::memory_callback(uint64_t from, char* to, uint64_t size, void* user_data)
|
||||
{
|
||||
DisassemblyInstance& instance = *static_cast<DisassemblyInstance*>(user_data);
|
||||
int64_t copysize = reinterpret_cast<int64_t>(instance.buffer.data()) + instance.buffer.size() -
|
||||
static_cast<int64_t>(from);
|
||||
copysize = std::min<int64_t>(size, copysize);
|
||||
std::memcpy(to, (char*) from, copysize);
|
||||
return copysize;
|
||||
}
|
||||
|
||||
void
|
||||
DisassemblyInstance::inst_callback(const char* instruction, void* user_data)
|
||||
{
|
||||
DisassemblyInstance& instance = *static_cast<DisassemblyInstance*>(user_data);
|
||||
|
||||
if(!instruction) return;
|
||||
|
||||
while(*instruction == '\t' || *instruction == ' ')
|
||||
instruction++;
|
||||
instance.last_instruction = instruction;
|
||||
}
|
||||
|
||||
#define CHECK_VA2FO(x, msg) \
|
||||
if(!(x)) \
|
||||
{ \
|
||||
std::cerr << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; \
|
||||
return std::nullopt; \
|
||||
}
|
||||
|
||||
// mem - input argument, start of the elf
|
||||
// va - input argument, virtual address
|
||||
// return file offset, if found
|
||||
std::optional<uint64_t>
|
||||
DisassemblyInstance::va2fo(uint64_t va)
|
||||
{
|
||||
CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough");
|
||||
|
||||
uint8_t* e_ident = (uint8_t*) buffer.data();
|
||||
CHECK_VA2FO(e_ident, "e_ident is nullptr");
|
||||
|
||||
CHECK_VA2FO(e_ident[EI_MAG0] == ELFMAG0 || e_ident[EI_MAG1] == ELFMAG1 ||
|
||||
e_ident[EI_MAG2] == ELFMAG2 || e_ident[EI_MAG3] == ELFMAG3,
|
||||
"unexpected ei_mag");
|
||||
|
||||
CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class");
|
||||
CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data");
|
||||
CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version");
|
||||
CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA
|
||||
|
||||
CHECK_VA2FO(e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4
|
||||
e_ident[EI_ABIVERSION] == 3,
|
||||
"unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5
|
||||
|
||||
Elf64_Ehdr* ehdr = (Elf64_Ehdr*) buffer.data();
|
||||
CHECK_VA2FO(ehdr, "ehdr is nullptr");
|
||||
CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type");
|
||||
CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine");
|
||||
CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff");
|
||||
|
||||
CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough");
|
||||
|
||||
Elf64_Phdr* phdr = (Elf64_Phdr*) ((uint8_t*) buffer.data() + ehdr->e_phoff);
|
||||
CHECK_VA2FO(phdr, "phdr is nullptr");
|
||||
|
||||
for(uint16_t i = 0; i < ehdr->e_phnum; ++i)
|
||||
{
|
||||
if(phdr[i].p_type != PT_LOAD) continue;
|
||||
if(va < phdr[i].p_vaddr || va >= (phdr[i].p_vaddr + phdr[i].p_memsz)) continue;
|
||||
|
||||
return va + phdr[i].p_offset - phdr[i].p_vaddr;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
#undef CHECK_VA2FO
|
||||
#define CHECK_VA2FO(x, msg) \
|
||||
if(!(x)) \
|
||||
{ \
|
||||
std::cerr << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; \
|
||||
return {}; \
|
||||
}
|
||||
|
||||
std::vector<std::pair<uint64_t, uint64_t>>
|
||||
DisassemblyInstance::getSegments()
|
||||
{
|
||||
CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough");
|
||||
|
||||
uint8_t* e_ident = (uint8_t*) buffer.data();
|
||||
CHECK_VA2FO(e_ident, "e_ident is nullptr");
|
||||
|
||||
CHECK_VA2FO(e_ident[EI_MAG0] == ELFMAG0 || e_ident[EI_MAG1] == ELFMAG1 ||
|
||||
e_ident[EI_MAG2] == ELFMAG2 || e_ident[EI_MAG3] == ELFMAG3,
|
||||
"unexpected ei_mag");
|
||||
|
||||
CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class");
|
||||
CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data");
|
||||
CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version");
|
||||
CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA
|
||||
|
||||
CHECK_VA2FO(e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4
|
||||
e_ident[EI_ABIVERSION] == 3,
|
||||
"unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5
|
||||
|
||||
Elf64_Ehdr* ehdr = (Elf64_Ehdr*) buffer.data();
|
||||
CHECK_VA2FO(ehdr, "ehdr is nullptr");
|
||||
CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type");
|
||||
CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine");
|
||||
CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff");
|
||||
|
||||
CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough");
|
||||
|
||||
Elf64_Phdr* phdr = (Elf64_Phdr*) ((uint8_t*) buffer.data() + ehdr->e_phoff);
|
||||
CHECK_VA2FO(phdr, "phdr is nullptr");
|
||||
|
||||
std::vector<std::pair<uint64_t, uint64_t>> segments;
|
||||
for(Elf64_Half i = 0; i < ehdr->e_phnum; ++i)
|
||||
{
|
||||
if(phdr[i].p_type != PT_LOAD) continue;
|
||||
|
||||
segments.push_back({phdr[i].p_vaddr - phdr[i].p_offset, phdr[i].p_memsz});
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <amd_comgr/amd_comgr.h>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
class CodeObjectBinary
|
||||
{
|
||||
public:
|
||||
CodeObjectBinary(const std::string& uri);
|
||||
std::string m_uri;
|
||||
std::vector<char> buffer;
|
||||
};
|
||||
|
||||
struct SymbolInfo
|
||||
{
|
||||
std::string name{};
|
||||
uint64_t faddr = 0;
|
||||
uint64_t vaddr = 0;
|
||||
uint64_t mem_size = 0;
|
||||
};
|
||||
|
||||
class DisassemblyInstance
|
||||
{
|
||||
public:
|
||||
DisassemblyInstance(const char* codeobj_data, uint64_t codeobj_size);
|
||||
~DisassemblyInstance();
|
||||
|
||||
std::pair<std::string, size_t> ReadInstruction(uint64_t faddr);
|
||||
std::map<uint64_t, SymbolInfo>& GetKernelMap();
|
||||
|
||||
static uint64_t memory_callback(uint64_t from, char* to, uint64_t size, void* user_data);
|
||||
static void inst_callback(const char* instruction, void* user_data);
|
||||
static amd_comgr_status_t symbol_callback(amd_comgr_symbol_t symbol, void* user_data);
|
||||
|
||||
std::optional<uint64_t> va2fo(uint64_t va);
|
||||
std::vector<std::pair<uint64_t, uint64_t>> getSegments();
|
||||
|
||||
std::vector<char> buffer;
|
||||
std::string last_instruction;
|
||||
amd_comgr_disassembly_info_t info;
|
||||
amd_comgr_data_t data;
|
||||
std::map<uint64_t, SymbolInfo> symbol_map;
|
||||
};
|
||||
@@ -0,0 +1,162 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
using codeobj_marker_id_t = size_t;
|
||||
|
||||
template <typename Type>
|
||||
class ordered_vector : public std::vector<Type>
|
||||
{
|
||||
using Super = std::vector<Type>;
|
||||
|
||||
public:
|
||||
void insert(const Type& elem)
|
||||
{
|
||||
size_t loc = lower_bound(elem.begin());
|
||||
if(this->size() && get(loc).begin() < elem.begin()) loc++;
|
||||
this->Super::insert(this->begin() + loc, elem);
|
||||
}
|
||||
bool remove(const Type& elem)
|
||||
{
|
||||
if(!this->size()) return false;
|
||||
size_t loc = lower_bound(elem.begin());
|
||||
if(get(loc) != elem) return false;
|
||||
|
||||
this->Super::erase(this->begin() + loc);
|
||||
return true;
|
||||
}
|
||||
bool remove(uint64_t elem_begin)
|
||||
{
|
||||
if(!this->size()) return false;
|
||||
size_t loc = lower_bound(elem_begin);
|
||||
if(get(loc).begin() != elem_begin) return false;
|
||||
|
||||
this->Super::erase(this->begin() + loc);
|
||||
return true;
|
||||
}
|
||||
size_t lower_bound(size_t addr) const
|
||||
{
|
||||
if(!this->size()) return 0;
|
||||
return binary_search(addr, 0, this->size() - 1);
|
||||
}
|
||||
|
||||
size_t binary_search(size_t addr, size_t s, size_t e) const
|
||||
{
|
||||
if(s >= e)
|
||||
return s;
|
||||
else if(s + 1 == e)
|
||||
return (get(e).begin() <= addr) ? e : s;
|
||||
|
||||
size_t mid = (s + e) / 2;
|
||||
if(get(mid).begin() <= addr)
|
||||
return binary_search(addr, mid, e);
|
||||
else
|
||||
return binary_search(addr, s, mid);
|
||||
}
|
||||
const Type& get(size_t i) const { return this->operator[](i); }
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Finds a candidate codeobj for the given vaddr
|
||||
*/
|
||||
template <typename Type>
|
||||
class cached_ordered_vector : public ordered_vector<Type>
|
||||
{
|
||||
using Super = ordered_vector<Type>;
|
||||
|
||||
public:
|
||||
cached_ordered_vector<Type>() { reset(); }
|
||||
|
||||
const Type& find_obj(uint64_t addr)
|
||||
{
|
||||
if(testCache(addr)) return get(cached_segment);
|
||||
|
||||
size_t lb = this->lower_bound(addr);
|
||||
if(lb >= this->size() || !get(lb).inrange(addr))
|
||||
throw std::string("segment addr out of range");
|
||||
|
||||
cached_segment = lb;
|
||||
return get(cached_segment);
|
||||
}
|
||||
|
||||
uint64_t find_addr(uint64_t addr) { return find_obj(addr).begin(); }
|
||||
|
||||
bool testCache(uint64_t addr) const
|
||||
{
|
||||
return this->cached_segment < this->size() && get(cached_segment).inrange(addr);
|
||||
}
|
||||
|
||||
const Type& get(size_t index) const { return this->data()[index]; }
|
||||
|
||||
void insert(const Type& elem) { this->Super::insert(elem); }
|
||||
void insert_list(std::vector<Type> arange)
|
||||
{
|
||||
for(auto& elem : arange)
|
||||
push_back(elem);
|
||||
std::sort(this->begin(), this->end(), [](const Type& a, const Type& b) {
|
||||
return a.begin() < b.begin();
|
||||
});
|
||||
};
|
||||
|
||||
void reset() { cached_segment = ~0; }
|
||||
void clear()
|
||||
{
|
||||
reset();
|
||||
this->Super::clear();
|
||||
}
|
||||
bool remove(uint64_t addr)
|
||||
{
|
||||
reset();
|
||||
return this->Super::remove(addr);
|
||||
}
|
||||
|
||||
private:
|
||||
size_t cached_segment = ~0;
|
||||
};
|
||||
|
||||
struct address_range_t
|
||||
{
|
||||
uint64_t vbegin;
|
||||
uint64_t size;
|
||||
codeobj_marker_id_t id;
|
||||
uint64_t offset;
|
||||
|
||||
bool operator<(const address_range_t& other) const { return vbegin < other.vbegin; }
|
||||
bool inrange(uint64_t addr) const { return addr >= vbegin && addr < vbegin + size; };
|
||||
uint64_t begin() const { return vbegin; }
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Finds a candidate codeobj for the given vaddr
|
||||
*/
|
||||
class CodeobjTableTranslator : public cached_ordered_vector<address_range_t>
|
||||
{
|
||||
public:
|
||||
const address_range_t& find_codeobj_in_range(uint64_t addr) { return this->find_obj(addr); }
|
||||
};
|
||||
@@ -42,6 +42,7 @@ add_subdirectory(counters)
|
||||
add_subdirectory(aql)
|
||||
add_subdirectory(pc_sampling)
|
||||
add_subdirectory(marker)
|
||||
add_subdirectory(thread_trace)
|
||||
|
||||
target_link_libraries(
|
||||
rocprofiler-object-library
|
||||
|
||||
@@ -1,17 +1,352 @@
|
||||
// This file should be removed when it appears in AQL Profile
|
||||
#pragma ONCE
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ven_amd_aqlprofile.h>
|
||||
|
||||
#define PUBLIC_API __attribute__((visibility("default")))
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint64_t handle;
|
||||
} aqlprofile_handle_t;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
AQLPROFILE_MEMORY_HINT_NONE = 0,
|
||||
AQLPROFILE_MEMORY_HINT_HOST = 1,
|
||||
AQLPROFILE_MEMORY_HINT_DEVICE_UNCACHED = 2,
|
||||
AQLPROFILE_MEMORY_HINT_DEVICE_COHERENT = 3,
|
||||
AQLPROFILE_MEMORY_HINT_DEVICE_NONCOHERENT = 4,
|
||||
AQLPROFILE_MEMORY_HINT_LAST
|
||||
} aqlprofile_memory_hint_t;
|
||||
|
||||
/**
|
||||
* @brief Flags to describe which agents can access given buffer.
|
||||
*/
|
||||
typedef union
|
||||
{
|
||||
uint32_t raw;
|
||||
struct
|
||||
{
|
||||
uint32_t device_access : 1;
|
||||
uint32_t host_access : 1;
|
||||
uint32_t memory_hint : 6; // One of aqlprofile_memory_hint_t
|
||||
uint32_t _reserved : 24;
|
||||
};
|
||||
} aqlprofile_buffer_desc_flags_t;
|
||||
|
||||
/**
|
||||
* @brief Callback to request a memory buffer, which will be tied to a profile.
|
||||
* The user is responsible for clearing up memory after the profile is no longer needed.
|
||||
* @param[out] ptr The pointer containing memory.
|
||||
* @param[in] size Minimum requested buffer size.
|
||||
* @param[in] flags Access flags, requesting which agents need to read/write to the buffer.
|
||||
* @param[in] userdata Data to be passed back to user.
|
||||
* @retval HSA_STATUS_SUCCESS if successful
|
||||
* @retval HSA_STATUS_ERROR if memory could not be allocated
|
||||
*/
|
||||
typedef hsa_status_t (*aqlprofile_memory_alloc_callback_t)(void** ptr,
|
||||
uint64_t size,
|
||||
aqlprofile_buffer_desc_flags_t flags,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Callback to dealloc memory requested via aqlprofile_memory_alloc_callback_t
|
||||
* @param[in] ptr The pointer containing memory.
|
||||
* @param[in] userdata Data to be passed back to user.
|
||||
* @retval HSA_STATUS_SUCCESS if successful
|
||||
* @retval HSA_STATUS_ERROR if memory could not be allocated
|
||||
*/
|
||||
typedef void (*aqlprofile_memory_dealloc_callback_t)(void* ptr, void* userdata);
|
||||
|
||||
typedef enum
|
||||
{
|
||||
AQLPROFILE_ACCUMULATION_NONE = 0, /** Do not accumulate event */
|
||||
AQLPROFILE_ACCUMULATION_LO_RES, /**< The event should be integrated over quad-cycles */
|
||||
AQLPROFILE_ACCUMULATION_HI_RES, /**< The event should be integrated every cycle */
|
||||
AQLPROFILE_ACCUMULATION_LAST,
|
||||
} aqlprofile_accumulation_type_t;
|
||||
|
||||
/**
|
||||
* @brief Special flags indicating additional properties to a counter. E.g. Accumulation metrics
|
||||
*/
|
||||
typedef union
|
||||
{
|
||||
uint32_t raw;
|
||||
struct
|
||||
{
|
||||
uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */
|
||||
uint32_t _reserved : 29;
|
||||
} sq_flags;
|
||||
} aqlprofile_pmc_event_flags_t;
|
||||
|
||||
/**
|
||||
* @brief Struct containing all necessary information of an event (counter).
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint32_t block_index; /**< Block channel. */
|
||||
uint32_t event_id; /**< Event ID as fined by XML */
|
||||
aqlprofile_pmc_event_flags_t flags; /**< Special event flags e.g. accumulation */
|
||||
hsa_ven_amd_aqlprofile_block_name_t block_name; /**< Block name as defined by block indexes */
|
||||
} aqlprofile_pmc_event_t;
|
||||
|
||||
/**
|
||||
* @brief Struct containing information about the agent. User code sets these values
|
||||
* to the describe the agent to profile. Information can be obtained either from HSA
|
||||
* (if loaded) or the KFD topology.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
const char* agent_gfxip; /**< Agent GFXIP (HSA_AGENT_INFO_NAME or KFD.product_name) */
|
||||
uint32_t xcc_num; /**< XCC's on the agent (HSA_AMD_AGENT_INFO_NUM_XCC or KFD.num_xcc) */
|
||||
uint32_t se_num; /**< SE's on the agent (HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES or
|
||||
KFD.num_shader_banks) */
|
||||
uint32_t
|
||||
cu_num; /**< CU's on the agent (HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT or KFD.cu_count) */
|
||||
uint32_t shader_arrays_per_se; /**< Shader arrays per SE of agent
|
||||
(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE or
|
||||
KFD.simd_arrays_per_engine)*/
|
||||
} aqlprofile_agent_info_t;
|
||||
|
||||
/**
|
||||
* @brief Struct containing a handle to a registered agent
|
||||
*
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t handle;
|
||||
} aqlprofile_agent_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Registers an agent to be used with AQL profile.
|
||||
* @param[out] agent_id Handle to newly registered agent
|
||||
* @param[in] agent_info Info to register a new agent with AQL Profiler
|
||||
* @retval HSA_STATUS_SUCCESS registration ok
|
||||
* @retval HSA_STATUS_ERROR registration failed
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_register_agent(aqlprofile_agent_handle_t* agent_id,
|
||||
const aqlprofile_agent_info_t* agent_info);
|
||||
|
||||
/**
|
||||
* @brief AQLprofile struct containing information for perfmon events
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
aqlprofile_agent_handle_t agent;
|
||||
const aqlprofile_pmc_event_t* events;
|
||||
uint32_t event_count;
|
||||
} aqlprofile_pmc_profile_t;
|
||||
|
||||
// Profile attributes
|
||||
typedef enum
|
||||
{
|
||||
AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0, // get_info returns uint32_t value
|
||||
AQLPROFILE_INFO_PMC_DATA_SIZE = 1, // get_info returns uint32_t value
|
||||
AQLPROFILE_INFO_PMC_DATA = 2, // get_info returns PMC uint64_t value
|
||||
// in info_data object
|
||||
AQLPROFILE_INFO_BLOCK_COUNTERS = 4, // get_info returns number of block counter
|
||||
AQLPROFILE_INFO_BLOCK_ID = 5, // get_info returns block id, instances
|
||||
// by name string using _id_query_t
|
||||
AQLPROFILE_INFO_ENABLE_CMD = 6, // get_info returns size/pointer for
|
||||
// counters enable command buffer
|
||||
AQLPROFILE_INFO_DISABLE_CMD = 7, // get_info returns size/pointer for
|
||||
// counters disable command buffer
|
||||
} aqlprofile_pmc_info_type_t;
|
||||
|
||||
hsa_status_t
|
||||
aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile,
|
||||
aqlprofile_pmc_info_type_t attribute,
|
||||
void* value);
|
||||
|
||||
/**
|
||||
* @brief AQLprofile struct containing information for Advanced Thread Trace
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
hsa_agent_t agent;
|
||||
const hsa_ven_amd_aqlprofile_parameter_t* parameters;
|
||||
uint32_t parameter_count;
|
||||
} aqlprofile_att_profile_t;
|
||||
|
||||
/**
|
||||
* @brief Data callback for perfmon events. Each event will call this once per coordinate
|
||||
* @param[in] event The event information passed in from aqlprofile_pmc_profile_t
|
||||
* @param[in] counter_id Internal ID of the counter
|
||||
* @param[in] counter_value The event value, as incremented from start() to stop()
|
||||
* @param[in] userdata Data returned to user
|
||||
* @retval HSA_STATUS_SUCCESS to continue iteration
|
||||
* @retval HSA_STATUS_ERROR to stop callback iteration
|
||||
*/
|
||||
typedef hsa_status_t (*aqlprofile_pmc_data_callback_t)(aqlprofile_pmc_event_t event,
|
||||
uint64_t counter_id,
|
||||
uint64_t counter_value,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Data callback for thread trace. This will be called at least once per shader engine
|
||||
* @param[in] shader Shader Engine ID
|
||||
* @param[in] buffer Pointer containing the data
|
||||
* @param[in] size Amount of bytes used by thread trace
|
||||
* @param[in] callback_data Data returned to user
|
||||
* @retval HSA_STATUS_SUCCESS to continue iteration
|
||||
* @retval HSA_STATUS_ERROR to stop callback iteration
|
||||
*/
|
||||
typedef hsa_status_t (*aqlprofile_att_data_callback_t)(uint32_t shader,
|
||||
void* buffer,
|
||||
uint64_t size,
|
||||
void* callback_data);
|
||||
|
||||
/**
|
||||
* @brief Memory copy fn for aqlprofile to copy data.
|
||||
* @param[in] dst Destination pointer to copy data to.
|
||||
* @param[in] src Source pointer where data is to be copied from.
|
||||
* @param[in] size Amount of bytes to be copied.
|
||||
* @param[in] userdata Data returned to user
|
||||
* @retval HSA_STATUS_SUCCESS on success
|
||||
* @retval HSA_STATUS_ERROR on failure
|
||||
*/
|
||||
typedef hsa_status_t (*aqlprofile_memory_copy_t)(void* dst,
|
||||
const void* src,
|
||||
size_t size,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Validates the event for the agent.
|
||||
* @param[in] agent The agent to validate the event for.
|
||||
* @param[in] event The event to validate.
|
||||
* @param[out] result True if the event is valid for the agent, false otherwise.
|
||||
* @retval HSA_STATUS_SUCCESS if the event was validated.
|
||||
* @retval HSA_STATUS_ERROR if the event was not validated.
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_validate_pmc_event(aqlprofile_agent_handle_t agent,
|
||||
const aqlprofile_pmc_event_t* event,
|
||||
bool* result);
|
||||
|
||||
/**
|
||||
* @brief Iterate_data() will parse the event data and call @callback with the resulting event data
|
||||
* @param[in] handle The handle returned from aqlprofile_pmc_create_packets()
|
||||
* @param[in] callback CB where the resulting event values are going to be returned
|
||||
* @param[in] userdata Data sent back to user
|
||||
* @retval HSA_STATUS_SUCCESS all operations exited succesfully
|
||||
* @retval HSA_STATUS_ERROR if some callback returns an error
|
||||
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_pmc_iterate_data(aqlprofile_handle_t handle,
|
||||
aqlprofile_pmc_data_callback_t callback,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Struct to be returned by aqlprofile_pmc_create_packets
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
hsa_ext_amd_aql_pm4_packet_t start_packet; /**< Reset counters and start incrementing */
|
||||
hsa_ext_amd_aql_pm4_packet_t stop_packet; /**< Pause counters from incrementing */
|
||||
hsa_ext_amd_aql_pm4_packet_t read_packet; /**< Retrieve results from device */
|
||||
} aqlprofile_pmc_aql_packets_t;
|
||||
|
||||
/**
|
||||
* @brief Function to create AQL packets to be inserted into the queue.
|
||||
* @param[out] handle To be passed to iterate_data()
|
||||
* @param[out] packets Pointer to where the start, stop and read packets will be written to
|
||||
* @param[in] profile Agent and events information
|
||||
* @param[in] alloc_cb Memory allocation, which may request cpu or gpu memory for internal use
|
||||
* @param[in] dealloc_cb Function to free memory allocated by alloc_cb
|
||||
* @param[in] userdata Data passed back to user via memory alloc callback
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_pmc_create_packets(aqlprofile_handle_t* handle,
|
||||
aqlprofile_pmc_aql_packets_t* packets,
|
||||
aqlprofile_pmc_profile_t profile,
|
||||
aqlprofile_memory_alloc_callback_t alloc_cb,
|
||||
aqlprofile_memory_dealloc_callback_t dealloc_cb,
|
||||
aqlprofile_memory_copy_t memcpy_cb,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Function to delete AQL packets after creation by aqlprofile_pmc_create_packets
|
||||
* @param[in] handle Returned by aqlprofile_pmc_create_packets()
|
||||
*/
|
||||
void
|
||||
aqlprofile_pmc_delete_packets(aqlprofile_handle_t handle);
|
||||
|
||||
/**
|
||||
* @brief Iterates over thread trace data and the data to user
|
||||
* @param[in] handle The handle returned from aqlprofile_att_create_packets()
|
||||
* @param[in] callback CB where the resulting data is going to be returned
|
||||
* @param[in] userdata Data sent back to user
|
||||
* @retval HSA_STATUS_SUCCESS all operations exited succesfully
|
||||
* @retval HSA_STATUS_ERROR if some callback returns an error
|
||||
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_att_iterate_data(aqlprofile_handle_t handle,
|
||||
aqlprofile_att_data_callback_t callback,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Struct containing AQLpackets to start and stop thread trace
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
hsa_ext_amd_aql_pm4_packet_t start_packet; /**< Packet to start thread trace */
|
||||
hsa_ext_amd_aql_pm4_packet_t stop_packet; /**< Packet to stop thread trace and flush data */
|
||||
} aqlprofile_att_control_aql_packets_t;
|
||||
|
||||
/**
|
||||
* @brief Fn to create start and stop thread trace packets
|
||||
* @param[out] handle To be passed to iterate_data()
|
||||
* @param[out] packets Packets returned by this function to start and stop thread trace
|
||||
* @param[in] profile Agent information and extra parameters for thread trace
|
||||
* @param[in] callback Memory allocation fn which may request cpu or gpu memory
|
||||
* @retval HSA_STATUS_SUCCESS if all packets created succesfully
|
||||
* @retval HSA_STATUS_ERROR otherwise
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_att_create_packets(aqlprofile_handle_t* handle,
|
||||
aqlprofile_att_control_aql_packets_t* packets,
|
||||
aqlprofile_att_profile_t profile,
|
||||
aqlprofile_memory_alloc_callback_t alloc_cb,
|
||||
aqlprofile_memory_dealloc_callback_t dealloc_cb,
|
||||
aqlprofile_memory_copy_t memcpy_cb,
|
||||
void* userdata);
|
||||
|
||||
void
|
||||
aqlprofile_att_delete_packets(aqlprofile_handle_t handle);
|
||||
|
||||
/**
|
||||
* @brief Callback for iteration of all possible event coordinate IDs and coordinate names.
|
||||
* @param [in] id Integer identifying the dimension.
|
||||
* @param [in] name Name of the dimension
|
||||
* @param [in] data User data supplied to @ref aqlprofile_iterate_event_ids
|
||||
* @return hsa_status_t
|
||||
* @retval HSA_STATUS_SUCCESS Continues iteration
|
||||
* @retval OTHERS Any other HSA return values stops iteration, passing back this value through
|
||||
* @ref aqlprofile_iterate_event_ids
|
||||
@@ -22,12 +357,314 @@ typedef hsa_status_t (*aqlprofile_eventname_callback_t)(int id, const char* name
|
||||
* @brief Iterate over all possible event coordinate IDs and their names.
|
||||
* @param [in] callback Callback to use for iteration of dimensions
|
||||
* @param [in] user_data Data to supply to callback @ref aqlprofile_eventname_callback_t
|
||||
* @return hsa_status_t
|
||||
* @retval HSA_STATUS_SUCCESS if successful
|
||||
* @retval HSA_STATUS_ERROR if error on interation
|
||||
* @retval OTHERS If @ref aqlprofile_eventname_callback_t returns non-HSA_STATUS_SUCCESS,
|
||||
* that value is returned.
|
||||
*/
|
||||
PUBLIC_API hsa_status_t
|
||||
hsa_status_t
|
||||
aqlprofile_iterate_event_ids(aqlprofile_eventname_callback_t callback, void* user_data);
|
||||
|
||||
/**
|
||||
* @brief Iterate over all event coordinates for a given agent_t and event_t.
|
||||
* @param position A counting sequence indicating callback number.
|
||||
* @param id Coordinate ID as in _iterate_event_ids.
|
||||
* @param extent Coordinate extent indicating maximum allowed instances.
|
||||
* @param coordinate The coordinate, in the range [0,extent-1].
|
||||
* @param name Coordinate name as in _iterate_event_ids.
|
||||
* @param userdata Userdata returned from _iterate_event_coord function.
|
||||
*/
|
||||
typedef hsa_status_t (*aqlprofile_coordinate_callback_t)(int position,
|
||||
int id,
|
||||
int extent,
|
||||
int coordinate,
|
||||
const char* name,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Iterate over all event coordinates for a given agent_t and event_t.
|
||||
* @param[in] agent HSA agent.
|
||||
* @param[in] event The event ID and block ID to iterate for.
|
||||
* @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data.
|
||||
* @param[in] callback Callback function to return the coordinates.
|
||||
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_iterate_event_coord(aqlprofile_agent_handle_t agent,
|
||||
aqlprofile_pmc_event_t event,
|
||||
uint64_t sample_id,
|
||||
aqlprofile_coordinate_callback_t callback,
|
||||
void* userdata);
|
||||
|
||||
typedef union
|
||||
{
|
||||
uint64_t raw;
|
||||
struct
|
||||
{
|
||||
uint64_t isValid : 1;
|
||||
uint64_t isNavi : 1;
|
||||
uint64_t npiWaveData : 1;
|
||||
uint64_t version : 13;
|
||||
};
|
||||
} att_output_flags_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int64_t time;
|
||||
uint16_t events0;
|
||||
uint16_t events1;
|
||||
uint16_t events2;
|
||||
uint16_t events3;
|
||||
uint8_t CU;
|
||||
uint8_t bank;
|
||||
} att_perfevent_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint64_t kernel_id : 12;
|
||||
uint64_t simd : 2;
|
||||
uint64_t slot : 4;
|
||||
uint64_t enable : 1;
|
||||
uint64_t cu : 4;
|
||||
uint64_t time : 41; // Time_value/8
|
||||
} att_occupancy_info_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int32_t type;
|
||||
int32_t duration;
|
||||
} wave_state_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int64_t time;
|
||||
int64_t duration;
|
||||
} wave_instruction_t;
|
||||
|
||||
enum WAVESLOT_STATE
|
||||
{
|
||||
WS_EMPTY = 0,
|
||||
WS_IDLE = 1,
|
||||
WS_EXEC = 2,
|
||||
WS_WAIT = 3,
|
||||
WS_STALL = 4,
|
||||
WS_UNKNOWN = 5,
|
||||
};
|
||||
|
||||
enum WaveInstCategory
|
||||
{
|
||||
NONE = 0,
|
||||
SMEM = 1,
|
||||
SALU = 2,
|
||||
VMEM = 3,
|
||||
FLAT = 4,
|
||||
LDS = 5,
|
||||
VALU = 6,
|
||||
JUMP = 7,
|
||||
NEXT = 8,
|
||||
IMMED = 9,
|
||||
TRAP = 10,
|
||||
PCINFO = 15,
|
||||
WAVE_NOT_FINISHED,
|
||||
};
|
||||
|
||||
enum WaveTrapStatus
|
||||
{
|
||||
TRAP_RESTORED = 0,
|
||||
TRAP_REQUEST = 1,
|
||||
TRAP_SAVED = 1,
|
||||
TRAP_STANDBY = 2
|
||||
};
|
||||
|
||||
typedef struct
|
||||
{
|
||||
size_t addr;
|
||||
size_t marker_id;
|
||||
} pcinfo_t;
|
||||
|
||||
typedef struct __attribute__((packed))
|
||||
{
|
||||
uint64_t category : 8;
|
||||
uint64_t hitcount : 56;
|
||||
uint64_t latency;
|
||||
pcinfo_t pc;
|
||||
} att_trace_event_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint8_t simd;
|
||||
uint8_t wave_id;
|
||||
uint8_t trap_status;
|
||||
uint8_t reserved;
|
||||
|
||||
// VMEM Pipeline: instrs and stalls
|
||||
int num_vmem_instrs;
|
||||
int num_vmem_stalls;
|
||||
// FLAT instrs and stalls
|
||||
int num_flat_instrs;
|
||||
int num_flat_stalls;
|
||||
|
||||
// LDS instr and stalls
|
||||
int num_lds_instrs;
|
||||
int num_lds_stalls;
|
||||
|
||||
// SCA instrs stalls
|
||||
int num_salu_instrs;
|
||||
int num_smem_instrs;
|
||||
int num_salu_stalls;
|
||||
int num_smem_stalls;
|
||||
|
||||
// Branch
|
||||
int num_branch_instrs;
|
||||
int num_branch_taken_instrs;
|
||||
int num_branch_stalls;
|
||||
|
||||
// total VMEM/FLAT/LDS/SMEM instructions issued
|
||||
int num_mem_instrs; // total issued memory instructions
|
||||
int num_valu_stalls;
|
||||
size_t num_valu_instrs;
|
||||
size_t num_issued_instrs; // total issued instructions (compute + memory)
|
||||
|
||||
int64_t begin_time; // Begin and end cycle
|
||||
int64_t end_time;
|
||||
int64_t traceID;
|
||||
|
||||
size_t timeline_size;
|
||||
size_t instructions_size;
|
||||
wave_state_t* timeline_array;
|
||||
wave_instruction_t* instructions_array;
|
||||
} wave_data_t;
|
||||
|
||||
/**
|
||||
* @brief Callback for rocprofiler to return ISA to aqlprofile ATT parser.
|
||||
* The caller must copy a desired instruction on isa_instruction and source_reference,
|
||||
* while obeying the max length passed by the caller.
|
||||
* If the caller's length is insufficient, then this function writes the minimum sizes to isa_size
|
||||
* and source_size and returns HSA_STATUS_ERROR_OUT_OF_RESOURCES.
|
||||
* If call returns HSA_STATUS_SUCCESS, isa_size and source_size are written with bytes used.
|
||||
* @param[out] isa_instruction Where to copy the ISA line to.
|
||||
* @param[out] source_reference Reference to source line and/or additional comments in the binary.
|
||||
* @param[out] isa_memory_size (Auto) The number of bytes to next instruction. 0 for custom ISA.
|
||||
* @param[inout] isa_size Size of returned ISA string.
|
||||
* @param[inout] source_size Size of returned reference/comment string.
|
||||
* @param[in] marker_id The generated ATT marker for given codeobject ID.
|
||||
* @param[in] offset The offset from base vaddr for given codeobj ID.
|
||||
* If marker_id == 0, this parameter is raw virtual address with no codeobj ID information.
|
||||
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
|
||||
* @retval HSA_STATUS_SUCCESS on success.
|
||||
* @retval HSA_STATUS_ERROR on generic error.
|
||||
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid offset or invalid marker_id.
|
||||
* @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES for insufficient isa_size or source_size.
|
||||
*/
|
||||
typedef hsa_status_t (*aqlprofile_att_isa_callback_t)(char* isa_instruction,
|
||||
char* source_reference,
|
||||
uint64_t* isa_memory_size,
|
||||
uint64_t* isa_size,
|
||||
uint64_t* source_size,
|
||||
uint64_t marker_id,
|
||||
uint64_t offset,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Callback for rocprofiler to return traces back to rocprofiler.
|
||||
* @param[in] trace_type_id The type of this trace as in _iterate_event_ids().
|
||||
* @param[in] correlation_id The ID of shader engine or trace callback number.
|
||||
* @param[in] trace_events A pointer to sequence of events, of size trace_size.
|
||||
* @param[in] trace_size The number of events in the trace.
|
||||
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
|
||||
*/
|
||||
typedef hsa_status_t (*aqlprofile_att_trace_callback_t)(int trace_type_id,
|
||||
int correlation_id,
|
||||
void* trace_events,
|
||||
uint64_t trace_size,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Callback for the ATT parser to retrieve Shader Engine data.
|
||||
* Returns the amount of data filled. If no more data is available, then callback return 0
|
||||
* If the space available in the buffer is less than required for parsing the full data,
|
||||
* the full data is transfered over multiple calls.
|
||||
* When all data has been transfered from current shader_engine_id, the caller has the option to
|
||||
* 1) Return -1 on shader_engine ID and parsing terminates
|
||||
* 2) Move to the next shader engine.
|
||||
* @param[out] shader_engine_id The ID of given shader engine.
|
||||
* @param[out] buffer The buffer to fill up with SE data.
|
||||
* @param[out] buffer_size The space available in the buffer.
|
||||
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
|
||||
* @returns Number of bytes remaining in shader engine.
|
||||
* @retval 0 if no more SE data is available. Parsing will stop.
|
||||
* @retval buffer_size if the buffer does not hold enough data for the current shader engine.
|
||||
* @retval 0 > ret > buffer_size for partially filled buffer, and caller moves over to next SE.
|
||||
*/
|
||||
typedef uint64_t (*aqlprofile_att_se_data_callback_t)(int* shader_engine_id,
|
||||
uint8_t** buffer,
|
||||
uint64_t* buffer_size,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Callback returning from aqlprofile_att_parser_iterate_event_list
|
||||
* @param[in] trace_event_id ID of the event.
|
||||
* @param[in] trace_event_metadata Null-terminated string, entries separated by ';'
|
||||
* @param[in] userdata userdata.
|
||||
*/
|
||||
typedef void (*aqlprofile_att_parser_iterate_event_cb_t)(int trace_event_id,
|
||||
const char* trace_event_metadata,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Iterate over all available event types.
|
||||
* @param[in] callback Callback where events are returned to.
|
||||
* @param[in] userdata userdata.
|
||||
*/
|
||||
void
|
||||
aqlprofile_att_parser_iterate_event_list(aqlprofile_att_parser_iterate_event_cb_t callback,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Iterate over all event coordinates for a given agent_t and event_t.
|
||||
* @param[in] se_data_callback Callback to return shader engine data from.
|
||||
* @param[in] trace_callback Callback where the trace data is returned to.
|
||||
* Each trace will be marked by the ID returned on aqlprofile_att_parser_iterate_event_list.
|
||||
* @param[in] isa_callback Callback to return ISA lines.
|
||||
* @param[in] userdata Userdata passed back to caller via callback.
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_att_parse_data(aqlprofile_att_se_data_callback_t se_data_callback,
|
||||
aqlprofile_att_trace_callback_t trace_callback,
|
||||
aqlprofile_att_isa_callback_t isa_callback,
|
||||
void* userdata);
|
||||
|
||||
/**
|
||||
* @brief Contains flags for how code objects are interpreted
|
||||
*/
|
||||
typedef union
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload
|
||||
uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started?
|
||||
uint32_t legacy_id : 30; // Legacy code object ID, if it fits in 30 bits.
|
||||
};
|
||||
uint32_t raw;
|
||||
} aqlprofile_att_header_marker_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an AQL packet for marking code objects
|
||||
* @param[out] packets Returned packet
|
||||
* @param[in] handle The handle created from aqlprofile_att_create_packets()
|
||||
* @param[in] header Header containing code object information created from profiler
|
||||
* @param[in] id To be passed back to isa_string_callback in marker_id
|
||||
* @param[in] addr Code object loaded address.
|
||||
* @param[in] size Code object loaded size.
|
||||
*/
|
||||
hsa_status_t
|
||||
aqlprofile_att_codeobj_load_marker(hsa_ext_amd_aql_pm4_packet_t* packets,
|
||||
aqlprofile_handle_t handle,
|
||||
aqlprofile_att_header_marker_t header,
|
||||
uint64_t id,
|
||||
uint64_t addr,
|
||||
uint64_t size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -26,12 +26,22 @@
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include "glog/logging.h"
|
||||
|
||||
#define CHECK_HSA(fn, message) \
|
||||
{ \
|
||||
auto status = (fn); \
|
||||
if(status != HSA_STATUS_SUCCESS) \
|
||||
{ \
|
||||
std::cerr << "HSA Err: " << status << "\n"; \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace aql
|
||||
{
|
||||
AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache& agent,
|
||||
const std::vector<counters::Metric>& metrics)
|
||||
CounterPacketConstruct::CounterPacketConstruct(const hsa::AgentCache& agent,
|
||||
const std::vector<counters::Metric>& metrics)
|
||||
: _agent(agent)
|
||||
{
|
||||
// Validate that the counter exists and construct the block instances
|
||||
@@ -67,12 +77,11 @@ AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache& agen
|
||||
_events = get_all_events();
|
||||
}
|
||||
|
||||
std::unique_ptr<hsa::AQLPacket>
|
||||
AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
|
||||
std::unique_ptr<hsa::CounterAQLPacket>
|
||||
CounterPacketConstruct::construct_packet(const AmdExtTable& ext)
|
||||
{
|
||||
const size_t MEM_PAGE_MASK = 0x1000 - 1;
|
||||
auto pkt_ptr = std::make_unique<hsa::AQLPacket>(ext.hsa_amd_memory_pool_free_fn);
|
||||
auto& pkt = *pkt_ptr;
|
||||
auto pkt_ptr = std::make_unique<hsa::CounterAQLPacket>(ext.hsa_amd_memory_pool_free_fn);
|
||||
auto& pkt = *pkt_ptr;
|
||||
if(_events.empty())
|
||||
{
|
||||
return pkt_ptr;
|
||||
@@ -104,15 +113,7 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
|
||||
_agent.get_hsa_agent().handle));
|
||||
}
|
||||
|
||||
auto throw_if_failed = [](auto status, auto& message) {
|
||||
if(status != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
throw std::runtime_error(message);
|
||||
}
|
||||
};
|
||||
|
||||
throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, nullptr),
|
||||
"could not generate packet sizes");
|
||||
CHECK_HSA(hsa_ven_amd_aqlprofile_start(&profile, nullptr), "could not generate packet sizes");
|
||||
|
||||
if(profile.command_buffer.size == 0 || profile.output_buffer.size == 0)
|
||||
{
|
||||
@@ -125,7 +126,7 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
|
||||
// Allocate buffers and check the results
|
||||
auto alloc_and_check = [&](auto& pool, auto** mem_loc, auto size) -> bool {
|
||||
bool malloced = false;
|
||||
size_t page_aligned = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
size_t page_aligned = getPageAligned(size);
|
||||
if(ext.hsa_amd_memory_pool_allocate_fn(
|
||||
pool, page_aligned, 0, static_cast<void**>(mem_loc)) != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
@@ -153,22 +154,74 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
|
||||
_agent.kernarg_pool(), &profile.output_buffer.ptr, profile.output_buffer.size);
|
||||
memset(profile.output_buffer.ptr, 0x0, profile.output_buffer.size);
|
||||
|
||||
// throw if we do not construct the packets correctly.
|
||||
throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start),
|
||||
"could not generate start packet");
|
||||
throw_if_failed(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop),
|
||||
"could not generate stop packet");
|
||||
throw_if_failed(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read),
|
||||
"could not generate read packet");
|
||||
CHECK_HSA(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start), "failed to create start packet");
|
||||
CHECK_HSA(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop), "failed to create stop packet");
|
||||
CHECK_HSA(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read), "failed to create read packet");
|
||||
pkt.start.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
pkt.stop.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
pkt.read.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
return pkt_ptr;
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wnarrowing"
|
||||
|
||||
ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(
|
||||
const hsa::AgentCache& agent,
|
||||
std::shared_ptr<thread_trace_parameters>& params,
|
||||
const CoreApiTable& coreapi,
|
||||
const AmdExtTable& ext)
|
||||
{
|
||||
this->tracepool = std::make_shared<hsa::TraceMemoryPool>();
|
||||
this->tracepool->allocate_fn = ext.hsa_amd_memory_pool_allocate_fn;
|
||||
this->tracepool->allow_access_fn = ext.hsa_amd_agents_allow_access_fn;
|
||||
this->tracepool->free_fn = ext.hsa_amd_memory_pool_free_fn;
|
||||
this->tracepool->api_copy_fn = coreapi.hsa_memory_copy_fn;
|
||||
this->tracepool->gpu_agent = agent.get_hsa_agent();
|
||||
this->tracepool->cpu_pool_ = agent.cpu_pool();
|
||||
this->tracepool->gpu_pool_ = agent.gpu_pool();
|
||||
|
||||
this->aql_params.clear();
|
||||
auto& p = this->aql_params;
|
||||
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, params->target_cu});
|
||||
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, params->shader_engine_mask});
|
||||
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, params->simd_select});
|
||||
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, params->buffer_size});
|
||||
|
||||
this->profile = aqlprofile_att_profile_t{agent.get_hsa_agent(), p.data(), p.size()};
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
std::unique_ptr<hsa::TraceAQLPacket>
|
||||
ThreadTraceAQLPacketFactory::construct_packet()
|
||||
{
|
||||
auto packet = std::make_unique<hsa::TraceAQLPacket>(this->tracepool);
|
||||
/*hsa_status_t _status = aqlprofile_att_create_packets(&packet->handle,
|
||||
&packet->packets,
|
||||
this->profile,
|
||||
&hsa::TraceAQLPacket::Alloc,
|
||||
&hsa::TraceAQLPacket::Free,
|
||||
&hsa::TraceAQLPacket::Copy,
|
||||
packet.get());
|
||||
CHECK_HSA(_status, "failed to create ATT packet");*/
|
||||
|
||||
packet->before_krn_pkt.clear();
|
||||
packet->after_krn_pkt.clear();
|
||||
packet->packets.start_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
packet->packets.stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
packet->empty = false;
|
||||
packet->start = packet->packets.start_packet;
|
||||
packet->stop = packet->packets.stop_packet;
|
||||
packet->before_krn_pkt.push_back(packet->start);
|
||||
packet->after_krn_pkt.push_back(packet->stop);
|
||||
|
||||
return packet;
|
||||
}
|
||||
|
||||
std::vector<hsa_ven_amd_aqlprofile_event_t>
|
||||
AQLPacketConstruct::get_all_events() const
|
||||
CounterPacketConstruct::get_all_events() const
|
||||
{
|
||||
std::vector<hsa_ven_amd_aqlprofile_event_t> ret;
|
||||
for(const auto& metric : _metrics)
|
||||
@@ -179,7 +232,7 @@ AQLPacketConstruct::get_all_events() const
|
||||
}
|
||||
|
||||
const counters::Metric*
|
||||
AQLPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const
|
||||
CounterPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const
|
||||
{
|
||||
if(const auto* ptr = rocprofiler::common::get_val(
|
||||
_event_to_metric,
|
||||
@@ -191,7 +244,7 @@ AQLPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event)
|
||||
}
|
||||
|
||||
const std::vector<hsa_ven_amd_aqlprofile_event_t>&
|
||||
AQLPacketConstruct::get_counter_events(const counters::Metric& metric) const
|
||||
CounterPacketConstruct::get_counter_events(const counters::Metric& metric) const
|
||||
{
|
||||
for(const auto& prof_metric : _metrics)
|
||||
{
|
||||
@@ -204,7 +257,7 @@ AQLPacketConstruct::get_counter_events(const counters::Metric& metric) const
|
||||
}
|
||||
|
||||
void
|
||||
AQLPacketConstruct::can_collect()
|
||||
CounterPacketConstruct::can_collect()
|
||||
{
|
||||
// Verify that the counters fit within harrdware limits
|
||||
std::map<std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>, int64_t> counter_count;
|
||||
|
||||
@@ -30,10 +30,12 @@
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
#include <hsa/hsa_ven_amd_aqlprofile.h>
|
||||
|
||||
#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h"
|
||||
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
|
||||
#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp"
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
@@ -47,21 +49,27 @@ namespace aql
|
||||
* consturcted start/stop/read packets along with allocated buffers needed
|
||||
* to collect the counter data.
|
||||
*/
|
||||
class AQLPacketConstruct
|
||||
class CounterPacketConstruct
|
||||
{
|
||||
public:
|
||||
AQLPacketConstruct(const hsa::AgentCache& agent, const std::vector<counters::Metric>& metrics);
|
||||
std::unique_ptr<hsa::AQLPacket> construct_packet(const AmdExtTable&) const;
|
||||
CounterPacketConstruct(const hsa::AgentCache& agent,
|
||||
const std::vector<counters::Metric>& metrics);
|
||||
std::unique_ptr<hsa::CounterAQLPacket> construct_packet(const AmdExtTable&);
|
||||
|
||||
const counters::Metric* event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const;
|
||||
std::vector<hsa_ven_amd_aqlprofile_event_t> get_all_events() const;
|
||||
hsa_agent_t hsa_agent() const { return _agent.get_hsa_agent(); }
|
||||
|
||||
const std::vector<hsa_ven_amd_aqlprofile_event_t>& get_counter_events(
|
||||
const counters::Metric&) const;
|
||||
|
||||
hsa_agent_t hsa_agent() const { return _agent.get_hsa_agent(); }
|
||||
|
||||
private:
|
||||
const hsa::AgentCache& _agent;
|
||||
static constexpr size_t MEM_PAGE_ALIGN = 0x1000;
|
||||
static constexpr size_t MEM_PAGE_MASK = MEM_PAGE_ALIGN - 1;
|
||||
static size_t getPageAligned(size_t p) { return (p + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; }
|
||||
|
||||
protected:
|
||||
struct AQLProfileMetric
|
||||
{
|
||||
counters::Metric metric;
|
||||
@@ -70,12 +78,26 @@ private:
|
||||
|
||||
void can_collect();
|
||||
|
||||
const hsa::AgentCache& _agent;
|
||||
std::vector<AQLProfileMetric> _metrics;
|
||||
std::vector<hsa_ven_amd_aqlprofile_event_t> _events;
|
||||
std::map<std::tuple<hsa_ven_amd_aqlprofile_block_name_t, uint32_t, uint32_t>, counters::Metric>
|
||||
_event_to_metric;
|
||||
};
|
||||
|
||||
class ThreadTraceAQLPacketFactory
|
||||
{
|
||||
public:
|
||||
ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent,
|
||||
std::shared_ptr<thread_trace_parameters>& params,
|
||||
const CoreApiTable& coreapi,
|
||||
const AmdExtTable& ext);
|
||||
std::unique_ptr<hsa::TraceAQLPacket> construct_packet();
|
||||
|
||||
private:
|
||||
std::shared_ptr<hsa::TraceMemoryPool> tracepool;
|
||||
std::vector<hsa_ven_amd_aqlprofile_parameter_t> aql_params;
|
||||
aqlprofile_att_profile_t profile;
|
||||
};
|
||||
|
||||
} // namespace aql
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -122,7 +122,7 @@ TEST(aql_profile, construct_packets)
|
||||
LOG(WARNING) << fmt::format("Found Agent: {}", agent.get_hsa_agent().handle);
|
||||
auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"});
|
||||
ASSERT_EQ(metrics.size(), 1);
|
||||
AQLPacketConstruct(agent, metrics);
|
||||
CounterPacketConstruct(agent, metrics);
|
||||
}
|
||||
hsa_shut_down();
|
||||
}
|
||||
@@ -142,7 +142,7 @@ TEST(aql_profile, too_many_counters)
|
||||
{
|
||||
try
|
||||
{
|
||||
AQLPacketConstruct(agent, metrics);
|
||||
CounterPacketConstruct(agent, metrics);
|
||||
} catch(const std::exception& e)
|
||||
{
|
||||
EXPECT_NE(e.what(), nullptr) << e.what();
|
||||
@@ -163,9 +163,9 @@ TEST(aql_profile, packet_generation_single)
|
||||
ASSERT_GT(agents.size(), 0);
|
||||
for(const auto& [_, agent] : agents)
|
||||
{
|
||||
auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"});
|
||||
AQLPacketConstruct pkt(agent, metrics);
|
||||
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
|
||||
auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"});
|
||||
CounterPacketConstruct pkt(agent, metrics);
|
||||
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
|
||||
EXPECT_TRUE(test_pkt);
|
||||
}
|
||||
|
||||
@@ -183,14 +183,30 @@ TEST(aql_profile, packet_generation_multi)
|
||||
{
|
||||
auto metrics =
|
||||
rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES", "TA_FLAT_READ_WAVEFRONTS"});
|
||||
AQLPacketConstruct pkt(agent, metrics);
|
||||
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
|
||||
CounterPacketConstruct pkt(agent, metrics);
|
||||
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
|
||||
EXPECT_TRUE(test_pkt);
|
||||
}
|
||||
|
||||
hsa_shut_down();
|
||||
}
|
||||
|
||||
class TestAqlPacket : public rocprofiler::hsa::CounterAQLPacket
|
||||
{
|
||||
public:
|
||||
TestAqlPacket(bool mallocd)
|
||||
: rocprofiler::hsa::CounterAQLPacket([](void* x) -> hsa_status_t {
|
||||
::free(x);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
})
|
||||
{
|
||||
this->profile.output_buffer.ptr = malloc(sizeof(double));
|
||||
this->profile.command_buffer.ptr = malloc(sizeof(double));
|
||||
this->command_buf_mallocd = mallocd;
|
||||
this->output_buffer_malloced = mallocd;
|
||||
}
|
||||
};
|
||||
|
||||
TEST(aql_profile, test_aql_packet)
|
||||
{
|
||||
auto check_null = [](auto& val) {
|
||||
@@ -201,25 +217,12 @@ TEST(aql_profile, test_aql_packet)
|
||||
val.completion_signal.handle == null_val.completion_signal.handle;
|
||||
};
|
||||
|
||||
rocprofiler::hsa::AQLPacket test_pkt([](void* x) -> hsa_status_t {
|
||||
::free(x);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
});
|
||||
TestAqlPacket test_pkt(true);
|
||||
EXPECT_TRUE(check_null(test_pkt.start)) << "Start packet not null";
|
||||
EXPECT_TRUE(check_null(test_pkt.stop)) << "Stop packet not null";
|
||||
EXPECT_TRUE(check_null(test_pkt.read)) << "Read packet not null";
|
||||
|
||||
// If this leaks, then AQLPacket is not freeing data correctly.
|
||||
test_pkt.profile.output_buffer.ptr = malloc(sizeof(double));
|
||||
test_pkt.profile.command_buffer.ptr = malloc(sizeof(double));
|
||||
test_pkt.command_buf_mallocd = true;
|
||||
test_pkt.output_buffer_malloced = true;
|
||||
|
||||
// test custom destructor as well
|
||||
rocprofiler::hsa::AQLPacket test_pkt2([](void* x) -> hsa_status_t {
|
||||
::free(x);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
});
|
||||
test_pkt2.profile.output_buffer.ptr = malloc(sizeof(double));
|
||||
test_pkt2.profile.command_buffer.ptr = malloc(sizeof(double));
|
||||
// Why is this valid?
|
||||
TestAqlPacket test_pkt2(false);
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/core.hpp"
|
||||
#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp"
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
@@ -446,6 +447,7 @@ start_context(rocprofiler_context_id_t context_id)
|
||||
}
|
||||
|
||||
if(cfg->counter_collection) rocprofiler::counters::start_context(cfg);
|
||||
if(cfg->thread_trace) cfg->thread_trace->start_context();
|
||||
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -472,6 +474,8 @@ stop_context(rocprofiler_context_id_t idx)
|
||||
|
||||
if(_expected->counter_collection)
|
||||
rocprofiler::counters::stop_context(const_cast<context*>(_expected));
|
||||
else if(_expected->thread_trace)
|
||||
_expected->thread_trace->stop_context();
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include "lib/rocprofiler-sdk/context/domain.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/core.hpp"
|
||||
#include "lib/rocprofiler-sdk/external_correlation.hpp"
|
||||
#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp"
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
@@ -153,6 +154,7 @@ struct context
|
||||
std::unique_ptr<callback_tracing_service> callback_tracer = {};
|
||||
std::unique_ptr<buffer_tracing_service> buffered_tracer = {};
|
||||
std::unique_ptr<counter_collection_service> counter_collection = {};
|
||||
std::shared_ptr<rocprofiler::ThreadTracer> thread_trace = {};
|
||||
};
|
||||
|
||||
// set the client index needs to be called before allocate_context()
|
||||
|
||||
@@ -218,7 +218,7 @@ counter_callback_info::setup_profile_config(const hsa::AgentCache& age
|
||||
}
|
||||
}
|
||||
|
||||
profile->pkt_generator = std::make_unique<rocprofiler::aql::AQLPacketConstruct>(
|
||||
profile->pkt_generator = std::make_unique<rocprofiler::aql::CounterPacketConstruct>(
|
||||
agent,
|
||||
std::vector<counters::Metric>{profile->reqired_hw_counters.begin(),
|
||||
profile->reqired_hw_counters.end()});
|
||||
@@ -288,7 +288,7 @@ queue_cb(const context::context* ctx,
|
||||
// Packet generated when no instrumentation is performed. May contain serialization
|
||||
// packets/barrier packets (and can be empty).
|
||||
auto no_instrumentation = [&]() {
|
||||
auto ret_pkt = std::make_unique<rocprofiler::hsa::AQLPacket>(nullptr);
|
||||
auto ret_pkt = std::make_unique<rocprofiler::hsa::CounterAQLPacket>(nullptr);
|
||||
// If we have a counter collection context but it is not enabled, we still might need
|
||||
// to add barrier packets to transition from serialized -> unserialized execution. This
|
||||
// transition is coordinated by the serializer.
|
||||
|
||||
@@ -59,7 +59,7 @@ struct profile_config
|
||||
std::vector<counters::EvaluateAST> asts{};
|
||||
rocprofiler_profile_config_id_t id{.handle = 0};
|
||||
// Packet generator to create AQL packets for insertion
|
||||
std::unique_ptr<rocprofiler::aql::AQLPacketConstruct> pkt_generator{nullptr};
|
||||
std::unique_ptr<rocprofiler::aql::CounterPacketConstruct> pkt_generator{nullptr};
|
||||
// A packet cache of AQL packets. This allows reuse of AQL packets (preventing costly
|
||||
// allocation of new packets/destruction).
|
||||
rocprofiler::common::Synchronized<std::vector<std::unique_ptr<rocprofiler::hsa::AQLPacket>>>
|
||||
|
||||
@@ -61,8 +61,8 @@ getBlockDimensions(std::string_view agent, const Metric& metric)
|
||||
{
|
||||
if(maybe_agent.name() == agent)
|
||||
{
|
||||
aql::AQLPacketConstruct pkt_gen(maybe_agent, {metric});
|
||||
const auto& events = pkt_gen.get_counter_events(metric);
|
||||
aql::CounterPacketConstruct pkt_gen(maybe_agent, {metric});
|
||||
const auto& events = pkt_gen.get_counter_events(metric);
|
||||
|
||||
for(const auto& event : events)
|
||||
{
|
||||
|
||||
@@ -450,16 +450,16 @@ EvaluateAST::read_special_counters(
|
||||
}
|
||||
|
||||
std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>>
|
||||
EvaluateAST::read_pkt(const aql::AQLPacketConstruct* pkt_gen, hsa::AQLPacket& pkt)
|
||||
EvaluateAST::read_pkt(const aql::CounterPacketConstruct* pkt_gen, hsa::AQLPacket& pkt)
|
||||
{
|
||||
struct it_data
|
||||
{
|
||||
std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>>* data;
|
||||
const aql::AQLPacketConstruct* pkt_gen;
|
||||
const aql::CounterPacketConstruct* pkt_gen;
|
||||
};
|
||||
|
||||
std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>> ret;
|
||||
if(pkt.empty) return ret;
|
||||
if(pkt.isEmpty()) return ret;
|
||||
it_data aql_data{.data = &ret, .pkt_gen = pkt_gen};
|
||||
;
|
||||
hsa_status_t status = hsa_ven_amd_aqlprofile_iterate_data(
|
||||
|
||||
@@ -136,8 +136,8 @@ public:
|
||||
*
|
||||
*/
|
||||
static std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>> read_pkt(
|
||||
const aql::AQLPacketConstruct* pkt_gen,
|
||||
hsa::AQLPacket& pkt);
|
||||
const aql::CounterPacketConstruct* pkt_gen,
|
||||
hsa::AQLPacket& pkt);
|
||||
|
||||
/**
|
||||
* @brief Insert special counter values, such as constants of the agent (i.e. max waves)
|
||||
|
||||
@@ -251,8 +251,8 @@ TEST(dimension, block_dim_test)
|
||||
}
|
||||
else
|
||||
{
|
||||
aql::AQLPacketConstruct pkt_gen(agent, {metric});
|
||||
const auto& events = pkt_gen.get_counter_events(metric);
|
||||
aql::CounterPacketConstruct pkt_gen(agent, {metric});
|
||||
const auto& events = pkt_gen.get_counter_events(metric);
|
||||
for(const auto& event : events)
|
||||
{
|
||||
std::map<int, uint64_t> dims;
|
||||
|
||||
@@ -21,14 +21,22 @@
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#define CHECK_HSA(fn, message) \
|
||||
if((fn) != HSA_STATUS_SUCCESS) \
|
||||
{ \
|
||||
std::cerr << __FILE__ << ':' << __LINE__ << ' ' << message; \
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace hsa
|
||||
{
|
||||
AQLPacket::~AQLPacket()
|
||||
CounterAQLPacket::~CounterAQLPacket()
|
||||
{
|
||||
if(!profile.command_buffer.ptr)
|
||||
{
|
||||
@@ -36,7 +44,7 @@ AQLPacket::~AQLPacket()
|
||||
}
|
||||
else if(!command_buf_mallocd)
|
||||
{
|
||||
free_func(profile.command_buffer.ptr);
|
||||
CHECK_HSA(free_func(profile.command_buffer.ptr), "freeing memory");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -49,12 +57,61 @@ AQLPacket::~AQLPacket()
|
||||
}
|
||||
else if(!output_buffer_malloced)
|
||||
{
|
||||
free_func(profile.output_buffer.ptr);
|
||||
CHECK_HSA(free_func(profile.output_buffer.ptr), "freeing memory");
|
||||
}
|
||||
else
|
||||
{
|
||||
::free(profile.output_buffer.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
TraceAQLPacket::~TraceAQLPacket() = default;
|
||||
/*
|
||||
TraceAQLPacket::~TraceAQLPacket()
|
||||
{
|
||||
aqlprofile_att_delete_packets(this->handle);
|
||||
}
|
||||
*/
|
||||
|
||||
hsa_status_t
|
||||
TraceAQLPacket::Alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t flags, void* data)
|
||||
{
|
||||
if(!data) return HSA_STATUS_ERROR;
|
||||
if(!reinterpret_cast<TraceAQLPacket*>(data)->tracepool) return HSA_STATUS_ERROR;
|
||||
|
||||
auto& pool = *reinterpret_cast<TraceAQLPacket*>(data)->tracepool;
|
||||
|
||||
if(!pool.allocate_fn || !pool.free_fn || !pool.allow_access_fn) return HSA_STATUS_ERROR;
|
||||
|
||||
if(flags.host_access)
|
||||
{
|
||||
hsa_status_t status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr);
|
||||
if(!flags.device_access || status != HSA_STATUS_SUCCESS) return status;
|
||||
return pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr);
|
||||
}
|
||||
return pool.allocate_fn(pool.gpu_pool_, size, 0, ptr);
|
||||
}
|
||||
|
||||
void
|
||||
TraceAQLPacket::Free(void* ptr, void* data)
|
||||
{
|
||||
auto* pool = reinterpret_cast<TraceAQLPacket*>(data)->tracepool.get();
|
||||
if(!pool || !pool->free_fn) return;
|
||||
|
||||
pool->free_fn(ptr);
|
||||
}
|
||||
|
||||
hsa_status_t
|
||||
TraceAQLPacket::Copy(void* dst, const void* src, size_t size, void* data)
|
||||
{
|
||||
auto* pool = reinterpret_cast<TraceAQLPacket*>(data)->tracepool.get();
|
||||
if(!pool || !pool->api_copy_fn) return HSA_STATUS_ERROR;
|
||||
|
||||
return pool->api_copy_fn(dst, src, size);
|
||||
}
|
||||
|
||||
TraceAQLPacket::TraceAQLPacket(std::shared_ptr<TraceMemoryPool>& _tracepool)
|
||||
: tracepool(_tracepool){};
|
||||
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "lib/common/container/small_vector.hpp"
|
||||
#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h"
|
||||
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <hsa/hsa_ven_amd_aqlprofile.h>
|
||||
@@ -30,6 +31,12 @@
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace aql
|
||||
{
|
||||
class CounterPacketConstruct;
|
||||
class ThreadTraceAQLPacketFactory;
|
||||
} // namespace aql
|
||||
|
||||
namespace hsa
|
||||
{
|
||||
constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = {
|
||||
@@ -41,12 +48,11 @@ constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = {
|
||||
* Struct containing AQL packet information. Including start/stop/read
|
||||
* packets along with allocated buffers
|
||||
*/
|
||||
struct AQLPacket
|
||||
class AQLPacket
|
||||
{
|
||||
using memory_pool_free_func_t = decltype(::hsa_amd_memory_pool_free)*;
|
||||
|
||||
AQLPacket(memory_pool_free_func_t func);
|
||||
~AQLPacket();
|
||||
public:
|
||||
AQLPacket() = default;
|
||||
virtual ~AQLPacket() = default;
|
||||
|
||||
// Keep move constuctors (i.e. std::move())
|
||||
AQLPacket(AQLPacket&& other) = default;
|
||||
@@ -56,20 +62,68 @@ struct AQLPacket
|
||||
AQLPacket(const AQLPacket&) = delete;
|
||||
AQLPacket& operator=(const AQLPacket&) = delete;
|
||||
|
||||
bool command_buf_mallocd = false;
|
||||
bool output_buffer_malloced = false;
|
||||
bool empty = {true};
|
||||
hsa_ven_amd_aqlprofile_profile_t profile = {};
|
||||
hsa_ext_amd_aql_pm4_packet_t start = null_amd_aql_pm4_packet;
|
||||
hsa_ext_amd_aql_pm4_packet_t stop = null_amd_aql_pm4_packet;
|
||||
hsa_ext_amd_aql_pm4_packet_t read = null_amd_aql_pm4_packet;
|
||||
memory_pool_free_func_t free_func = nullptr;
|
||||
hsa_ven_amd_aqlprofile_profile_t profile = {};
|
||||
hsa_ext_amd_aql_pm4_packet_t start = null_amd_aql_pm4_packet;
|
||||
hsa_ext_amd_aql_pm4_packet_t stop = null_amd_aql_pm4_packet;
|
||||
hsa_ext_amd_aql_pm4_packet_t read = null_amd_aql_pm4_packet;
|
||||
|
||||
common::container::small_vector<hsa_ext_amd_aql_pm4_packet_t, 3> before_krn_pkt = {};
|
||||
common::container::small_vector<hsa_ext_amd_aql_pm4_packet_t, 2> after_krn_pkt = {};
|
||||
|
||||
bool isEmpty() const { return empty; }
|
||||
bool empty = true;
|
||||
};
|
||||
|
||||
class CounterAQLPacket : public AQLPacket
|
||||
{
|
||||
friend class rocprofiler::aql::CounterPacketConstruct;
|
||||
using memory_pool_free_func_t = decltype(::hsa_amd_memory_pool_free)*;
|
||||
|
||||
public:
|
||||
CounterAQLPacket(memory_pool_free_func_t func)
|
||||
: free_func{func} {};
|
||||
~CounterAQLPacket() override;
|
||||
|
||||
protected:
|
||||
bool command_buf_mallocd = false;
|
||||
bool output_buffer_malloced = false;
|
||||
memory_pool_free_func_t free_func = nullptr;
|
||||
};
|
||||
|
||||
struct TraceMemoryPool
|
||||
{
|
||||
hsa_agent_t gpu_agent;
|
||||
hsa_amd_memory_pool_t cpu_pool_;
|
||||
hsa_amd_memory_pool_t gpu_pool_;
|
||||
decltype(hsa_amd_memory_pool_allocate)* allocate_fn;
|
||||
decltype(hsa_amd_agents_allow_access)* allow_access_fn;
|
||||
decltype(hsa_amd_memory_pool_free)* free_fn;
|
||||
decltype(hsa_memory_copy)* api_copy_fn;
|
||||
};
|
||||
|
||||
class TraceAQLPacket : public AQLPacket
|
||||
{
|
||||
friend class rocprofiler::aql::ThreadTraceAQLPacketFactory;
|
||||
|
||||
public:
|
||||
TraceAQLPacket(std::shared_ptr<TraceMemoryPool>& _tracepool);
|
||||
TraceMemoryPool& GetPool() const { return *tracepool; }
|
||||
aqlprofile_handle_t GetHandle() const { return handle; }
|
||||
uint64_t GetAgent() const { return tracepool->gpu_agent.handle; }
|
||||
~TraceAQLPacket() override;
|
||||
|
||||
protected:
|
||||
std::shared_ptr<TraceMemoryPool> tracepool;
|
||||
aqlprofile_att_control_aql_packets_t packets;
|
||||
aqlprofile_handle_t handle;
|
||||
|
||||
static hsa_status_t Alloc(void** ptr,
|
||||
size_t size,
|
||||
aqlprofile_buffer_desc_flags_t flags,
|
||||
void* data);
|
||||
static void Free(void* ptr, void* data);
|
||||
static hsa_status_t Copy(void* dst, const void* src, size_t size, void* data);
|
||||
};
|
||||
|
||||
inline AQLPacket::AQLPacket(memory_pool_free_func_t func)
|
||||
: free_func{func}
|
||||
{}
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -672,10 +672,10 @@ code_object_load_callback(hsa_executable_t executable,
|
||||
if(_status != HSA_STATUS_SUCCESS) return _status; \
|
||||
}
|
||||
|
||||
auto& loader_table = get_loader_table();
|
||||
auto code_obj_v = code_object{};
|
||||
auto& data = code_obj_v.rocp_data;
|
||||
int _storage_type = ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_NONE;
|
||||
auto& loader_table = get_loader_table();
|
||||
auto code_obj_v = code_object{};
|
||||
auto& data = code_obj_v.rocp_data;
|
||||
uint32_t _storage_type = ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_NONE;
|
||||
|
||||
LOG_IF(FATAL, data.size == 0) << "code object did not properly initialized the size field upon "
|
||||
"construction (this is likely a compiler bug)";
|
||||
|
||||
@@ -342,8 +342,6 @@ WriteInterceptor(const void* packets,
|
||||
{
|
||||
hsa_barrier_and_packet_t barrier{};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
// barrier.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
|
||||
// barrier.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;
|
||||
barrier.header |= 1 << HSA_PACKET_HEADER_BARRIER;
|
||||
barrier.completion_signal = original_packet.completion_signal;
|
||||
transformed_packets.emplace_back(barrier);
|
||||
|
||||
@@ -90,6 +90,9 @@ constexpr rocprofiler_agent_t default_agent =
|
||||
void
|
||||
QueueController::add_queue(hsa_queue_t* id, std::unique_ptr<Queue> queue)
|
||||
{
|
||||
for(auto& pre_initialize_fn : pre_initialize)
|
||||
pre_initialize_fn(queue->get_agent(), get_core_table(), get_ext_table());
|
||||
|
||||
CHECK(queue);
|
||||
_callback_cache.wlock([&](auto& callbacks) {
|
||||
_queues.wlock([&](auto& map) {
|
||||
@@ -111,6 +114,11 @@ void
|
||||
QueueController::destroy_queue(hsa_queue_t* id)
|
||||
{
|
||||
if(!id) return;
|
||||
_queues.wlock([&](auto& map) {
|
||||
for(auto& deinitialize_fn : pre_deinitialize)
|
||||
if(map.find(id) != map.end())
|
||||
deinitialize_fn(map.at(id)->get_agent(), get_core_table(), get_ext_table());
|
||||
});
|
||||
|
||||
const auto* queue = get_queue(*id);
|
||||
|
||||
@@ -195,7 +203,8 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
|
||||
{
|
||||
constexpr auto expected_context_size = 160UL;
|
||||
static_assert(
|
||||
sizeof(context::context) == expected_context_size,
|
||||
sizeof(context::context) ==
|
||||
expected_context_size + sizeof(std::shared_ptr<rocprofiler::ThreadTracer>),
|
||||
"If you added a new field to context struct, make sure there is a check here if it "
|
||||
"requires queue interception. Once you have done so, increment expected_context_size");
|
||||
|
||||
@@ -212,6 +221,20 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if(itr->thread_trace)
|
||||
{
|
||||
enable_intercepter = true;
|
||||
std::weak_ptr<rocprofiler::ThreadTracer> trace = itr->thread_trace;
|
||||
pre_initialize.emplace_back(
|
||||
[trace](const AgentCache& cache, const CoreApiTable& core, const AmdExtTable& ext) {
|
||||
if(auto locked = trace.lock()) locked->resource_init(cache, core, ext);
|
||||
});
|
||||
pre_deinitialize.emplace_back(
|
||||
[trace](const AgentCache& cache, const CoreApiTable&, const AmdExtTable&) {
|
||||
if(auto locked = trace.lock()) locked->resource_deinit(cache);
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(enable_intercepter)
|
||||
|
||||
@@ -103,6 +103,7 @@ public:
|
||||
private:
|
||||
using client_id_map_t = std::unordered_map<ClientID, agent_callback_tuple_t>;
|
||||
using agent_cache_map_t = std::unordered_map<uint32_t, AgentCache>;
|
||||
using resource_alloc_t = void(const AgentCache&, const CoreApiTable&, const AmdExtTable&);
|
||||
|
||||
CoreApiTable _core_table = {};
|
||||
AmdExtTable _ext_table = {};
|
||||
@@ -110,6 +111,9 @@ private:
|
||||
common::Synchronized<client_id_map_t> _callback_cache = {};
|
||||
agent_cache_map_t _supported_agents = {};
|
||||
common::Synchronized<hsa::profiler_serializer> _profiler_serializer;
|
||||
|
||||
std::vector<std::function<resource_alloc_t>> pre_initialize;
|
||||
std::vector<std::function<resource_alloc_t>> pre_deinitialize;
|
||||
};
|
||||
|
||||
QueueController*
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
set(ROCPROFILER_LIB_THREAD_TRACE_SOURCES att_core.cpp att_service.cpp)
|
||||
set(ROCPROFILER_LIB_THREAD_TRACE_HEADERS att_core.hpp)
|
||||
target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_THREAD_TRACE_SOURCES}
|
||||
${ROCPROFILER_LIB_THREAD_TRACE_HEADERS})
|
||||
|
||||
# if(ROCPROFILER_BUILD_TESTS) add_subdirectory(tests) endif()
|
||||
@@ -0,0 +1,238 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/intercept_table.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/common/container/stable_vector.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
#include "lib/rocprofiler-sdk/internal_threading.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <mutex>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define CHECK_HSA(fn, message) \
|
||||
{ \
|
||||
auto _status = (fn); \
|
||||
if(_status != HSA_STATUS_SUCCESS) \
|
||||
{ \
|
||||
LOG(ERROR) << "HSA Err: " << _status << '\n'; \
|
||||
throw std::runtime_error(message); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
using AQLPacketOwner = std::unique_ptr<hsa::AQLPacket>;
|
||||
using inst_pkt_t = common::container::small_vector<std::pair<AQLPacketOwner, int64_t>, 4>;
|
||||
using corr_id_map_t = hsa::Queue::queue_info_session_t::external_corr_id_map_t;
|
||||
|
||||
struct cbdata_t
|
||||
{
|
||||
void* tool_userdata;
|
||||
rocprofiler_att_shader_data_callback_t cb_fn;
|
||||
std::vector<uint8_t>* memory_space;
|
||||
};
|
||||
|
||||
/**
|
||||
* Callback we get from HSA interceptor when a kernel packet is being enqueued.
|
||||
* We return an AQLPacket containing the start/stop/read packets for injection.
|
||||
*/
|
||||
AQLPacketOwner
|
||||
pre_kernel_call(ThreadTracer& tracer,
|
||||
const hsa::Queue& queue,
|
||||
const hsa::rocprofiler_packet& kern_pkt,
|
||||
uint64_t kernel_id,
|
||||
const corr_id_map_t& extern_corr_ids,
|
||||
const context::correlation_id* corr_id)
|
||||
{
|
||||
(void) extern_corr_ids;
|
||||
(void) corr_id;
|
||||
|
||||
rocprofiler_correlation_id_t temp_corr_id;
|
||||
temp_corr_id.internal = 0;
|
||||
temp_corr_id.external.value = 0;
|
||||
temp_corr_id.external.ptr = nullptr;
|
||||
|
||||
auto control_flags = tracer.params->dispatch_cb_fn(queue.get_id(),
|
||||
queue.get_agent().get_rocp_agent(),
|
||||
temp_corr_id,
|
||||
&kern_pkt.kernel_dispatch,
|
||||
kernel_id,
|
||||
tracer.params->callback_userdata);
|
||||
|
||||
if(control_flags == ROCPROFILER_ATT_CONTROL_NONE) return nullptr;
|
||||
|
||||
assert(control_flags == ROCPROFILER_ATT_CONTROL_START_AND_STOP && "Error: Not implemented");
|
||||
|
||||
uint64_t agent = queue.get_agent().get_hsa_agent().handle;
|
||||
std::lock_guard<std::mutex> lk(tracer.trace_resources_mut);
|
||||
|
||||
try
|
||||
{
|
||||
auto moved = std::move(tracer.resources.at(agent));
|
||||
tracer.resources.erase(agent);
|
||||
return moved;
|
||||
} catch(std::out_of_range& e)
|
||||
{
|
||||
LOG(WARNING) << "Attempt to initialize ATT without allocated resources!\n";
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t
|
||||
thread_trace_callback(uint32_t shader, void* buffer, uint64_t size, void* callback_data)
|
||||
{
|
||||
void* tool_userdata = static_cast<cbdata_t*>(callback_data)->tool_userdata;
|
||||
auto callback_fn = *static_cast<cbdata_t*>(callback_data)->cb_fn;
|
||||
std::vector<uint8_t>& cpu_data = *static_cast<cbdata_t*>(callback_data)->memory_space;
|
||||
|
||||
// TODO(gbaraldi): Handle parallel callbacks
|
||||
static std::mutex mut;
|
||||
std::lock_guard<std::mutex> lk(mut);
|
||||
|
||||
if(size > cpu_data.size()) cpu_data.resize(size + cpu_data.size());
|
||||
|
||||
auto status = hsa::get_queue_controller()->get_core_table().hsa_memory_copy_fn(
|
||||
cpu_data.data(), buffer, size);
|
||||
if(status != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
LOG(WARNING) << "Failed to copy hsa memory!";
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
callback_fn(shader, 0, "", cpu_data.data(), size, tool_userdata);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void
|
||||
post_kernel_call(ThreadTracer& tracer, inst_pkt_t& aql)
|
||||
{
|
||||
std::vector<uint8_t> cpu_data{};
|
||||
auto pair = cbdata_t{tracer.params->callback_userdata, tracer.params->shader_cb_fn, &cpu_data};
|
||||
(void) pair;
|
||||
|
||||
for(auto& aql_pkt : aql)
|
||||
{
|
||||
auto* pkt = dynamic_cast<hsa::TraceAQLPacket*>(aql_pkt.first.get());
|
||||
if(!pkt) continue;
|
||||
|
||||
// auto status = aqlprofile_att_iterate_data(pkt->GetHandle(), thread_trace_callback,
|
||||
// &pair); CHECK_HSA(status, "Failed to iterate ATT data");
|
||||
|
||||
std::lock_guard<std::mutex> lk(tracer.trace_resources_mut);
|
||||
if(tracer.agent_active_queues.find(pkt->GetAgent()) != tracer.agent_active_queues.end())
|
||||
tracer.resources[pkt->GetAgent()] = std::move(aql_pkt.first);
|
||||
}
|
||||
}
|
||||
|
||||
common::Synchronized<std::optional<int64_t>> client;
|
||||
|
||||
void
|
||||
ThreadTracer::start_context()
|
||||
{
|
||||
// Only one thread should be attempting to enable/disable this context
|
||||
client.wlock([&](auto& client_id) {
|
||||
if(client_id) return;
|
||||
|
||||
client_id = hsa::get_queue_controller()->add_callback(
|
||||
std::nullopt,
|
||||
[=](const hsa::Queue& q,
|
||||
const hsa::rocprofiler_packet& kern_pkt,
|
||||
uint64_t kernel_id,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
|
||||
const context::correlation_id* corr_id) {
|
||||
(void) user_data;
|
||||
return pre_kernel_call(*this, q, kern_pkt, kernel_id, extern_corr_ids, corr_id);
|
||||
},
|
||||
[=](const hsa::Queue& q,
|
||||
hsa::rocprofiler_packet kern_pkt,
|
||||
const hsa::Queue::queue_info_session_t& session,
|
||||
inst_pkt_t& aql) {
|
||||
post_kernel_call(*this, aql);
|
||||
(void) session;
|
||||
(void) kern_pkt;
|
||||
(void) q;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
void
|
||||
ThreadTracer::stop_context()
|
||||
{
|
||||
client.wlock([&](auto& client_id) {
|
||||
if(!client_id) return;
|
||||
|
||||
// Remove our callbacks from HSA's queue controller
|
||||
hsa::get_queue_controller()->remove_callback(*client_id);
|
||||
client_id = std::nullopt;
|
||||
});
|
||||
}
|
||||
|
||||
void
|
||||
ThreadTracer::resource_init(const hsa::AgentCache& cache,
|
||||
const CoreApiTable& coreapi,
|
||||
const AmdExtTable& ext)
|
||||
{
|
||||
uint64_t agent = cache.get_hsa_agent().handle;
|
||||
std::lock_guard<std::mutex> lk(trace_resources_mut);
|
||||
|
||||
if(agent_active_queues.find(agent) != agent_active_queues.end())
|
||||
{
|
||||
agent_active_queues.at(agent).fetch_add(1);
|
||||
return;
|
||||
}
|
||||
|
||||
auto factory = aql::ThreadTraceAQLPacketFactory(cache, this->params, coreapi, ext);
|
||||
resources[agent] = factory.construct_packet();
|
||||
agent_active_queues[agent] = 1;
|
||||
}
|
||||
|
||||
void
|
||||
ThreadTracer::resource_deinit(const hsa::AgentCache& cache)
|
||||
{
|
||||
uint64_t agent = cache.get_hsa_agent().handle;
|
||||
std::lock_guard<std::mutex> lk(trace_resources_mut);
|
||||
|
||||
try
|
||||
{
|
||||
if(agent_active_queues.at(agent).fetch_add(-1) > 1) return;
|
||||
} catch(std::out_of_range&)
|
||||
{}
|
||||
|
||||
agent_active_queues.erase(agent);
|
||||
resources.erase(agent);
|
||||
}
|
||||
|
||||
} // namespace rocprofiler
|
||||
@@ -0,0 +1,75 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler-sdk/intercept_table.h>
|
||||
#include <tuple>
|
||||
#include "include/rocprofiler-sdk/thread_trace.h"
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
struct thread_trace_parameters
|
||||
{
|
||||
rocprofiler_context_id_t context_id;
|
||||
rocprofiler_att_dispatch_callback_t dispatch_cb_fn;
|
||||
rocprofiler_att_shader_data_callback_t shader_cb_fn;
|
||||
void* callback_userdata;
|
||||
|
||||
// Parameters
|
||||
rocprofiler_att_parameter_flag_t flags;
|
||||
uint64_t buffer_size;
|
||||
uint8_t target_cu;
|
||||
uint8_t simd_select;
|
||||
uint8_t reserved;
|
||||
uint8_t vmid_mask;
|
||||
uint16_t perfcounter_mask;
|
||||
uint8_t perfcounter_ctrl;
|
||||
uint64_t shader_engine_mask;
|
||||
|
||||
// GFX9 Only
|
||||
std::vector<std::string> perfcounters;
|
||||
};
|
||||
|
||||
namespace hsa
|
||||
{
|
||||
class AQLPacket;
|
||||
};
|
||||
|
||||
class ThreadTracer
|
||||
{
|
||||
public:
|
||||
ThreadTracer(std::shared_ptr<thread_trace_parameters>& _params)
|
||||
: params(_params){};
|
||||
virtual void start_context();
|
||||
virtual void stop_context();
|
||||
virtual void resource_init(const hsa::AgentCache&, const CoreApiTable&, const AmdExtTable&);
|
||||
virtual void resource_deinit(const hsa::AgentCache&);
|
||||
virtual ~ThreadTracer() = default;
|
||||
|
||||
std::shared_ptr<thread_trace_parameters> params;
|
||||
std::mutex trace_resources_mut;
|
||||
std::unordered_map<uint64_t, std::unique_ptr<hsa::AQLPacket>> resources;
|
||||
std::unordered_map<uint64_t, std::atomic<int>> agent_active_queues;
|
||||
}; // namespace thread_trace
|
||||
|
||||
} // namespace rocprofiler
|
||||
@@ -0,0 +1,81 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
|
||||
|
||||
extern "C" {
|
||||
/**
|
||||
* @brief Configure buffered dispatch profile Counting Service.
|
||||
* Collects the counters in dispatch packets and stores them
|
||||
* in buffer_id. The buffer may contain packets from more than
|
||||
* one dispatch (denoted by correlation id). Will trigger the
|
||||
* callback based on the parameters setup in buffer_id_t.
|
||||
*
|
||||
* @param [in] context_id context id
|
||||
* @param [in] buffer_id id of the buffer to use for the counting service
|
||||
* @param [in] profile profile config to use for dispatch
|
||||
* @return ::rocprofiler_status_t
|
||||
*/
|
||||
rocprofiler_status_t ROCPROFILER_API
|
||||
rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id,
|
||||
rocprofiler_att_parameters_t parameters,
|
||||
rocprofiler_att_dispatch_callback_t dispatch_callback,
|
||||
rocprofiler_att_shader_data_callback_t shader_callback,
|
||||
void* callback_userdata)
|
||||
{
|
||||
auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id);
|
||||
if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
|
||||
if(ctx->thread_trace) return ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED;
|
||||
if(parameters.flags.raw != 0) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
auto thread_tracer = std::make_shared<rocprofiler::thread_trace_parameters>();
|
||||
|
||||
thread_tracer->context_id = context_id;
|
||||
thread_tracer->dispatch_cb_fn = dispatch_callback;
|
||||
thread_tracer->shader_cb_fn = shader_callback;
|
||||
thread_tracer->callback_userdata = callback_userdata;
|
||||
|
||||
thread_tracer->flags = parameters.flags;
|
||||
thread_tracer->buffer_size = parameters.buffer_size;
|
||||
thread_tracer->target_cu = parameters.target_cu;
|
||||
thread_tracer->simd_select = parameters.simd_select;
|
||||
thread_tracer->vmid_mask = parameters.vmid_mask;
|
||||
|
||||
thread_tracer->perfcounter_mask = parameters.perfcounter_mask;
|
||||
thread_tracer->perfcounter_ctrl = parameters.perfcounter_ctrl;
|
||||
|
||||
for(int i = 0; i < parameters.perfcounter_num; i++)
|
||||
thread_tracer->perfcounters.emplace_back(parameters.perfcounter[i]);
|
||||
|
||||
thread_tracer->shader_engine_mask = 0;
|
||||
for(int i = 0; i < parameters.shader_num; i++)
|
||||
thread_tracer->shader_engine_mask |= 1ul << parameters.shader_ids[i];
|
||||
|
||||
ctx->thread_trace = std::make_shared<rocprofiler::ThreadTracer>(thread_tracer);
|
||||
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
@@ -127,9 +127,8 @@ run(int tid, int devid)
|
||||
{
|
||||
auto roctx_range_id = roctxRangeStart("run");
|
||||
|
||||
constexpr int min_sa = 8;
|
||||
constexpr int min_avail_simd = 24;
|
||||
dim3 grid(min_sa * min_avail_simd);
|
||||
constexpr int min_avail_simd = 128;
|
||||
dim3 grid(min_avail_simd);
|
||||
dim3 block(32);
|
||||
double time = 0.0;
|
||||
hipStream_t stream = {};
|
||||
|
||||
مرجع در شماره جدید
Block a user