diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 5539961fa4..870305dc64 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -29,5 +29,6 @@ add_subdirectory(api_buffered_tracing) add_subdirectory(code_object_tracing) add_subdirectory(counter_collection) add_subdirectory(intercept_table) +add_subdirectory(code_object_isa_decode) +add_subdirectory(advanced_thread_trace) add_subdirectory(external_correlation_id_request) -# add_subdirectory(code_object_isa_decode) add_subdirectory(advanced_thread_trace) diff --git a/samples/advanced_thread_trace/CMakeLists.txt b/samples/advanced_thread_trace/CMakeLists.txt index aded15bd51..8dfd609e50 100644 --- a/samples/advanced_thread_trace/CMakeLists.txt +++ b/samples/advanced_thread_trace/CMakeLists.txt @@ -26,23 +26,23 @@ foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) endforeach() find_package(rocprofiler-sdk REQUIRED) +find_package(amd_comgr REQUIRED) add_library(advanced-thread-trace-client SHARED) target_sources(advanced-thread-trace-client PRIVATE client.cpp) + target_link_libraries( advanced-thread-trace-client PRIVATE rocprofiler::rocprofiler rocprofiler::samples-build-flags - rocprofiler-sdk-codeobj rocprofiler::samples-common-library) + rocprofiler-sdk-codeobj rocprofiler::samples-common-library amd_comgr dw) set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g") -find_package(Threads REQUIRED) add_executable(advanced-thread-trace) target_sources(advanced-thread-trace PRIVATE main.cpp) -target_link_libraries( - advanced-thread-trace PRIVATE advanced-thread-trace-client Threads::Threads - rocprofiler::samples-build-flags) +target_link_libraries(advanced-thread-trace PRIVATE advanced-thread-trace-client + rocprofiler::samples-build-flags) rocprofiler_samples_get_preload_env(PRELOAD_ENV advanced-thread-trace-client) diff --git a/samples/advanced_thread_trace/client.cpp b/samples/advanced_thread_trace/client.cpp index cc43c06165..15d1169d2a 100644 --- a/samples/advanced_thread_trace/client.cpp +++ b/samples/advanced_thread_trace/client.cpp @@ -36,12 +36,11 @@ #include #include #include -#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" +#include +#include -#include "code_object_track.hpp" #include "common/defines.hpp" #include "common/filesystem.hpp" -#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" #include #include @@ -64,19 +63,28 @@ #include #include -#define TARGET_CU 1 -#define SIMD_SELECT 0x3 -#define BUFFER_SIZE 0x6000000 -#define NUM_SE 2 -constexpr bool COPY_MEMORY_CODEOBJ = false; +#define OUTPUT_OFSTREAM "advanced_thread_trace.log" +#define TARGET_CU 1 +#define SIMD_SELECT 0x3 +#define BUFFER_SIZE 0x6000000 +#define SE_MASK 0x11 +constexpr bool COPY_MEMORY_CODEOBJ = true; -template <> -struct std::hash -{ - uint64_t operator()(const pcinfo_t& info) const +#define C_API_BEGIN \ + try \ { - return info.addr ^ (info.marker_id << 32ul) ^ (info.marker_id >> 32ul); - } +#define C_API_END \ + } \ + catch(std::exception & e) \ + { \ + std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \ + } \ + catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; } + +struct pcinfo_t +{ + uint64_t marker_id; + uint64_t addr; }; bool @@ -97,6 +105,10 @@ namespace client using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using Instruction = rocprofiler::codeobj::disassembly::Instruction; +using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; +using SymbolInfo = rocprofiler::codeobj::disassembly::SymbolInfo; + rocprofiler_client_id_t* client_id = nullptr; rocprofiler_context_id_t client_ctx = {}; @@ -109,13 +121,35 @@ struct isa_map_elem_t struct ToolData { + ToolData() + { + try + { + output_file.open(OUTPUT_OFSTREAM); + } catch(...) + {} + + if(output_file.is_open()) + std::cout << "Writing code-object-isa-decode log to: " << OUTPUT_OFSTREAM << std::endl; + else + std::cout << "Could not open log file: " << OUTPUT_OFSTREAM << ", writing to stdout\n"; + }; + std::shared_mutex isa_map_mut; std::mutex output_mut; CodeobjAddressTranslate codeobjTranslate; std::map> isa_map; std::unordered_map kernels_in_codeobj = {}; std::unordered_map kernel_object_to_kernel_name = {}; - std::stringstream output; + int num_waves = 0; + + std::ostream& output() + { + if(output_file.is_open()) + return output_file; + else + return std::cout; + } std::stringstream printKernel(uint64_t vaddr) { @@ -133,6 +167,9 @@ struct ToolData } return ss; } + +private: + std::ofstream output_file; }; struct source_location @@ -151,15 +188,12 @@ struct trace_data_t ToolData* tool; }; -std::atomic TRACE_DATA_ID{-1}; -std::atomic KERNEL_ADDR_ID{-1}; -std::atomic OCCUPANCY_ID{-1}; - void tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, void* callback_data) { + C_API_BEGIN if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return; @@ -200,19 +234,18 @@ tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, (void) user_data; (void) callback_data; + C_API_END } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" - rocprofiler_att_control_flags_t -dispatch_callback(rocprofiler_queue_id_t queue_id, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t correlation_id, +dispatch_callback(rocprofiler_queue_id_t /* queue_id */, + const rocprofiler_agent_t* /* agent */, + rocprofiler_correlation_id_t /* correlation_id */, const hsa_kernel_dispatch_packet_t* dispatch_packet, - uint64_t kernel_id, - void* userdata) + uint64_t /* kernel_id */, + void* userdata) { + C_API_BEGIN assert(userdata && "Dispatch callback passed null!"); ToolData& tool = *reinterpret_cast(userdata); @@ -236,82 +269,55 @@ dispatch_callback(rocprofiler_queue_id_t queue_id, << std::endl; } + C_API_END return ROCPROFILER_ATT_CONTROL_NONE; } void -iterate_trace_types(int id, const char* metadata, void*) -{ - if(std::string_view(metadata).find("occupancy") == 0) - OCCUPANCY_ID.store(id); - else if(std::string_view(metadata).find("kernel_ids_addr") == 0) - KERNEL_ADDR_ID.store(id); - else if(std::string_view(metadata).find("tracedata") == 0) - TRACE_DATA_ID.store(id); -} - -hsa_status_t -get_trace_data(int trace_type_id, - int correlation_id, - void* trace_events, - uint64_t trace_size, - void* userdata) +get_trace_data(rocprofiler_att_parser_data_type_t type, void* att_data, void* userdata) { + C_API_BEGIN assert(userdata && "ISA callback passed null!"); trace_data_t& trace_data = *reinterpret_cast(userdata); assert(trace_data.tool && "ISA callback passed null!"); ToolData& tool = *reinterpret_cast(trace_data.tool); - std::stringstream ss; std::shared_lock shared_lock(tool.isa_map_mut); - if(trace_type_id == OCCUPANCY_ID) - { - ss << "Num waves: " << trace_size / 2 << '\n'; - // auto* occ = reinterpret_cast(trace_events); - } - else if(trace_type_id == KERNEL_ADDR_ID) - { - ss << "Num KRN events: " << trace_size << std::hex << '\n'; - auto* kaddr = reinterpret_cast(trace_events); - for(size_t i = 0; i < trace_size; i++) - if(kaddr[i].addr != 0) - { - ss << " - ADDR: " << kaddr[i].addr << ' ' << tool.printKernel(kaddr[i].addr).str() - << '\n'; - } - ss << std::dec; - } - else if(trace_type_id == TRACE_DATA_ID) - { - ss << "Trace Length: " << trace_size << '\n'; - auto* tracedata = reinterpret_cast(trace_events); + if(type == ROCPROFILER_ATT_PARSER_DATA_TYPE_OCCUPANCY) tool.num_waves++; - for(size_t i = 0; i < trace_size; i++) + if(type != ROCPROFILER_ATT_PARSER_DATA_TYPE_ISA) return; + + auto& event = *reinterpret_cast(att_data); + + pcinfo_t pc{event.marker_id, event.offset}; + auto it = tool.isa_map.find(pc); + if(it == tool.isa_map.end()) + { + shared_lock.unlock(); { - pcinfo_t pc = tracedata[i].pc; - auto it = tool.isa_map.find(pc); - if(it == tool.isa_map.end()) + std::unique_lock unique_lock(tool.isa_map_mut); + auto ptr = std::make_unique(); + try { - shared_lock.unlock(); - { - std::unique_lock unique_lock(tool.isa_map_mut); - auto ptr = std::make_unique(); - ptr->code_line = tool.codeobjTranslate.get(pc.marker_id, pc.addr); - it = tool.isa_map.emplace(pc, std::move(ptr)).first; - } - shared_lock.lock(); + ptr->code_line = tool.codeobjTranslate.get(pc.marker_id, pc.addr); + } catch(std::exception& e) + { + std::cerr << pc.marker_id << ":" << pc.addr << ' ' << e.what() << std::endl; + return; + } catch(...) + { + std::cerr << "Could not fetch: " << pc.marker_id << ':' << pc.addr << std::endl; + return; } - - it->second->hitcount.fetch_add(tracedata[i].hitcount, std::memory_order_relaxed); - it->second->latency.fetch_add(tracedata[i].latency, std::memory_order_relaxed); + it = tool.isa_map.emplace(pc, std::move(ptr)).first; } + shared_lock.lock(); } - std::unique_lock lk(tool.output_mut); - tool.output << ss.str(); - - return HSA_STATUS_SUCCESS; + it->second->hitcount.fetch_add(event.hitcount, std::memory_order_relaxed); + it->second->latency.fetch_add(event.latency, std::memory_order_relaxed); + C_API_END } uint64_t @@ -325,16 +331,15 @@ copy_trace_data(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userda return *buffer_size; } -hsa_status_t +rocprofiler_status_t isa_callback(char* isa_instruction, - char* source_reference, uint64_t* isa_memory_size, uint64_t* isa_size, - uint64_t* source_size, uint64_t marker_id, uint64_t offset, void* userdata) { + C_API_BEGIN assert(userdata && "ISA callback passed null!"); trace_data_t& trace_data = *reinterpret_cast(userdata); assert(trace_data.tool && "ISA callback passed null!"); @@ -347,56 +352,47 @@ isa_callback(char* isa_instruction, instruction = tool.codeobjTranslate.get(marker_id, offset); } - if(!instruction.get()) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if(!instruction.get()) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; { - size_t tmp_isa_size = *isa_size; - size_t tmp_source_size = *source_size; - *isa_size = instruction->inst.size(); - *source_size = instruction->comment.size(); + size_t tmp_isa_size = *isa_size; + *isa_size = instruction->inst.size(); - if(*isa_size > tmp_isa_size || *source_size > tmp_source_size) - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + if(*isa_size > tmp_isa_size) return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; } memcpy(isa_instruction, instruction->inst.data(), *isa_size); - memcpy(source_reference, instruction->comment.data(), *source_size); *isa_memory_size = instruction->size; auto ptr = std::make_unique(); ptr->code_line = std::move(instruction); - tool.isa_map.emplace(pcinfo_t{offset, marker_id}, std::move(ptr)); - - return HSA_STATUS_SUCCESS; + tool.isa_map.emplace(pcinfo_t{marker_id, offset}, std::move(ptr)); + C_API_END + return ROCPROFILER_STATUS_SUCCESS; } void -shader_data_callback(int64_t se_id, - int64_t data_type_id, - const char* data_type_name, - void* se_data, - size_t data_size, - void* userdata) +shader_data_callback(int64_t se_id, void* se_data, size_t data_size, void* userdata) { + C_API_BEGIN assert(userdata && "Shader callback passed null!"); ToolData& tool = *reinterpret_cast(userdata); { std::unique_lock lk(tool.output_mut); - tool.output << "SE ID: " << se_id << " with size " << data_size << std::hex << '\n'; + tool.output() << "SE ID: " << se_id << " with size " << data_size << std::hex << '\n'; } trace_data_t data{.id = se_id, .data = (uint8_t*) se_data, .size = data_size, .tool = &tool}; - auto status = aqlprofile_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data); - (void) status; + auto status = rocprofiler_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data); + if(status != ROCPROFILER_STATUS_SUCCESS) + std::cerr << "shader_data_callback failed with status " << status << std::endl; + C_API_END } -#pragma GCC diagnostic pop - int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { (void) fini_func; - aqlprofile_att_parser_iterate_event_list(iterate_trace_types, nullptr); ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); ROCPROFILER_CALL( @@ -408,22 +404,19 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) tool_data), "code object tracing service configure"); - rocprofiler_att_parameters_t parameters{}; - parameters.target_cu = TARGET_CU; - parameters.simd_select = SIMD_SELECT; - parameters.buffer_size = BUFFER_SIZE; + std::vector parameters; + parameters.push_back({ROCPROFILER_ATT_PARAMETER_TARGET_CU, TARGET_CU}); + parameters.push_back({ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, SIMD_SELECT}); + parameters.push_back({ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, BUFFER_SIZE}); + parameters.push_back({ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, SE_MASK}); - std::vector shaders; - for(size_t i = 0; i < NUM_SE; i++) - shaders.push_back(2 * i); // use shader engines 0, 2 - - parameters.shader_ids = shaders.data(); - parameters.shader_num = shaders.size(); - - ROCPROFILER_CALL( - rocprofiler_configure_thread_trace_service( - client_ctx, parameters, dispatch_callback, shader_data_callback, tool_data), - "thread trace service configure"); + ROCPROFILER_CALL(rocprofiler_configure_thread_trace_service(client_ctx, + parameters.data(), + parameters.size(), + dispatch_callback, + shader_data_callback, + tool_data), + "thread trace service configure"); int valid_ctx = 0; ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), @@ -476,9 +469,9 @@ tool_fini(void* tool_data) size_t latency = line->latency.load(std::memory_order_relaxed); auto& code_line = line->code_line->inst; - tool.output << std::hex << "0x" << addr.addr << std::dec << ' ' << code_line - << empty_space.substr(0, max_inst_size - code_line.size()) - << " Hit: " << hitcount << " - Latency: " << latency << '\n'; + tool.output() << std::hex << "0x" << addr.addr << std::dec << ' ' << code_line + << empty_space.substr(0, max_inst_size - code_line.size()) + << " Hit: " << hitcount << " - Latency: " << latency << '\n'; if(code_line.find("s_waitcnt") == 0) { @@ -511,30 +504,16 @@ tool_fini(void* tool_data) float vmc_fraction = 100 * vmc_latency / float(total_latency); float lgk_fraction = 100 * lgk_latency / float(total_latency); - tool.output << "Total executed instructions: " << total_exec << '\n' - << "Total executed vector instructions: " << vector_exec << " with average " - << vector_latency / float(vector_exec) << " cycles.\n" - << "Total executed scalar instructions: " << scalar_exec << " with average " - << scalar_latency / float(scalar_exec) << " cycles.\n" - << "Vector memory ops occupied: " << vmc_fraction << "% of cycles.\n" - << "Scalar and LDS memory ops occupied: " << lgk_fraction << "% of cycles.\n"; - - std::cout << tool.output.str(); + tool.output() << "Total executed instructions: " << total_exec << '\n' + << "Total executed vector instructions: " << vector_exec << " with average " + << vector_latency / float(vector_exec) << " cycles.\n" + << "Total executed scalar instructions: " << scalar_exec << " with average " + << scalar_latency / float(scalar_exec) << " cycles.\n" + << "Vector memory ops occupied: " << vmc_fraction << "% of cycles.\n" + << "Scalar and LDS memory ops occupied: " << lgk_fraction << "% of cycles.\n" + << std::endl; } -void -setup() -{ - if(int status = 0; - rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0) - { - ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure), - "force configuration"); - } -} - -// force configuration when library is loaded -bool cfg_on_load = (client::setup(), true); } // namespace client extern "C" rocprofiler_tool_configure_result_t* diff --git a/samples/code_object_isa_decode/CMakeLists.txt b/samples/code_object_isa_decode/CMakeLists.txt index 1a380d2b77..c873f3e74b 100644 --- a/samples/code_object_isa_decode/CMakeLists.txt +++ b/samples/code_object_isa_decode/CMakeLists.txt @@ -26,21 +26,28 @@ foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) endforeach() find_package(rocprofiler-sdk REQUIRED) -find_package(Threads REQUIRED) +find_package(amd_comgr REQUIRED) -add_executable(code-object-isa-decode) -target_sources(code-object-isa-decode PRIVATE main.cpp client.cpp) +add_library(code-object-isa-decode-client SHARED) +target_sources(code-object-isa-decode-client PRIVATE client.cpp) set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g") target_link_libraries( - code-object-isa-decode PRIVATE Threads::Threads rocprofiler::samples-common-library - rocprofiler-sdk-codeobj rocprofiler::rocprofiler) + code-object-isa-decode-client + PRIVATE rocprofiler::samples-common-library rocprofiler-sdk-codeobj + rocprofiler::rocprofiler amd_comgr dw) + +rocprofiler_samples_get_preload_env(PRELOAD_ENV code-object-isa-decode-client) + +add_executable(code-object-isa-decode) +target_sources(code-object-isa-decode PRIVATE main.cpp) +target_link_libraries(code-object-isa-decode PRIVATE code-object-isa-decode-client + rocprofiler::samples-build-flags) add_test(NAME code-object-isa-decode COMMAND $) set_tests_properties( code-object-isa-decode - PROPERTIES TIMEOUT 45 LABELS "samples" ENVIRONMENT - "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" FAIL_REGULAR_EXPRESSION - "threw an exception") + PROPERTIES TIMEOUT 45 LABELS "samples" ENVIRONMENT "${PRELOAD_ENV}" + FAIL_REGULAR_EXPRESSION "threw an exception") diff --git a/samples/code_object_isa_decode/client.cpp b/samples/code_object_isa_decode/client.cpp index 2f2c491df6..b554c6f5c0 100644 --- a/samples/code_object_isa_decode/client.cpp +++ b/samples/code_object_isa_decode/client.cpp @@ -25,6 +25,8 @@ # undef NDEBUG #endif +#define OUTPUT_OFSTREAM "code_obj_isa_decode.log" + /** * @file samples/code_object_isa_decode/client.cpp * @@ -36,7 +38,7 @@ #include #include #include -#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" +#include #include "common/defines.hpp" #include "common/filesystem.hpp" @@ -60,16 +62,37 @@ #include #include #include -#include "code_object_track.hpp" + +constexpr bool COPY_MEMORY_CODEOBJ = true; namespace client { +std::ostream& +output_stream() +{ + static std::ofstream file(OUTPUT_OFSTREAM); + + static bool file_is_open_check = [&]() { + if(!file.is_open()) + std::cout << "Could not open log file: " << OUTPUT_OFSTREAM << ", writing to stdout\n"; + else + std::cout << "Writing code-object-isa-decode log to: " << OUTPUT_OFSTREAM << std::endl; + return file.is_open(); + }(); + + if(!file_is_open_check) return std::cout; + return file; +}; + namespace { using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; using kernel_symbol_map_t = std::unordered_map>; +using Instruction = rocprofiler::codeobj::disassembly::Instruction; +using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; + rocprofiler_client_id_t* client_id = nullptr; rocprofiler_client_finalize_t client_fini_func = nullptr; rocprofiler_context_id_t client_ctx = {}; @@ -92,28 +115,41 @@ tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, if(std::string_view(data->uri).find("file:///") == 0) { codeobjTranslate.addDecoder( - data->uri, data->code_object_id, data->load_base, data->load_size); - auto symbolmap = codeobjTranslate.getSymbolMap(); - for(auto& [vaddr, symbol] : symbolmap) - registered_kernels.insert({symbol.name, {vaddr, vaddr + symbol.mem_size}}); + data->uri, data->code_object_id, data->load_delta, data->load_size); } + else if(COPY_MEMORY_CODEOBJ) + { + codeobjTranslate.addDecoder(reinterpret_cast(data->memory_base), + data->memory_size, + data->code_object_id, + data->load_delta, + data->load_size); + } + else + { + return; + } + + auto symbolmap = codeobjTranslate.getSymbolMap(data->code_object_id); + for(auto& [vaddr, symbol] : symbolmap) + registered_kernels.insert({symbol.name, {vaddr, vaddr + symbol.mem_size}}); } else if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) { - std::cout << std::hex; + output_stream() << std::hex; auto* data = static_cast(record.payload); auto kernel_name = std::regex_replace(data->kernel_name, std::regex{"(\\.kd)$"}, ""); if(registered_kernels.find(kernel_name) == registered_kernels.end()) { - std::cout << "Not Found: " << kernel_name << " in codeobj." << std::endl; + output_stream() << "Not Found: " << kernel_name << " in codeobj." << std::endl; return; } auto& begin_end = registered_kernels.at(kernel_name); - std::cout << std::hex << "Found: " << kernel_name << " at addr: 0x" << begin_end.first - << std::dec << ". Printing first 64 bytes:" << std::endl; + output_stream() << std::hex << "Found: " << kernel_name << " at addr: 0x" << begin_end.first + << std::dec << ". Printing first 64 bytes:" << std::endl; std::unordered_set references{}; int num_waitcnts = 0; @@ -127,7 +163,7 @@ tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, auto inst = codeobjTranslate.get(vaddr); std::string_view source = inst->comment; if(source.rfind('/') < source.size()) source = source.substr(source.rfind('/')); - if(vaddr < begin_end.first + 64) std::cout << '\t' << inst->inst << '\n'; + if(vaddr < begin_end.first + 64) output_stream() << '\t' << inst->inst << '\n'; if(source.rfind(':') < source.size()) source = source.substr(0, source.rfind(':')); @@ -144,12 +180,13 @@ tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, vaddr += inst->size; } - std::cout << " --- Num Scalar: " << num_scalar << "\n --- Num Vector: " << num_vector - << "\n --- Num Waitcnts: " << num_waitcnts - << "\n --- Other instructions: " << num_other - << "\nKernel has source references to: " << std::endl; + output_stream() << " --- Num Scalar: " << num_scalar + << "\n --- Num Vector: " << num_vector + << "\n --- Num Waitcnts: " << num_waitcnts + << "\n --- Other instructions: " << num_other + << "\nKernel has source references to: " << std::endl; for(auto& ref : references) - std::cout << '\t' << ref << std::endl; + output_stream() << '\t' << ref << std::endl; } (void) user_data; @@ -195,20 +232,8 @@ tool_fini(void* tool_data) (void) tool_data; } -void -setup() -{ - if(int status = 0; - rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0) - { - ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure), - "force configuration"); - } -} } // namespace -// force configuration when library is loaded -bool cfg_on_load = (client::setup(), true); } // namespace client extern "C" rocprofiler_tool_configure_result_t* diff --git a/source/include/CMakeLists.txt b/source/include/CMakeLists.txt index d0c7e1ba82..8b8d02481e 100644 --- a/source/include/CMakeLists.txt +++ b/source/include/CMakeLists.txt @@ -5,3 +5,4 @@ set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "development") add_subdirectory(rocprofiler-sdk) add_subdirectory(rocprofiler-sdk-roctx) +add_subdirectory(rocprofiler-sdk-codeobj) diff --git a/source/include/rocprofiler-sdk-codeobj/CMakeLists.txt b/source/include/rocprofiler-sdk-codeobj/CMakeLists.txt new file mode 100644 index 0000000000..cdfed40b41 --- /dev/null +++ b/source/include/rocprofiler-sdk-codeobj/CMakeLists.txt @@ -0,0 +1,24 @@ +# ############################################################################## +# # Copyright (c) 2024 Advanced Micro Devices, Inc. # # Permission is hereby +# granted, free of charge, to any person obtaining a copy # of this software and +# associated documentation files (the "Software"), to # deal in the Software +# without restriction, including without limitation the # rights to use, copy, +# modify, merge, publish, distribute, sublicense, and/or # sell copies of the +# Software, and to permit persons to whom the Software is # furnished to do so, +# subject to the following conditions: # # The above copyright notice and this +# permission notice shall be included in # all copies or substantial portions of +# the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +# EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +# USE OR OTHER DEALINGS # IN THE SOFTWARE. +# ############################################################################## + +set(CODEOBJ_PARSER_HEADERS code_printing.hpp disassembly.hpp segment.hpp) + +install( + FILES ${CODEOBJ_PARSER_HEADERS} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk-codeobj + COMPONENT development) diff --git a/source/lib/rocprofiler-sdk-codeobj/code_printing.hpp b/source/include/rocprofiler-sdk-codeobj/code_printing.hpp similarity index 95% rename from source/lib/rocprofiler-sdk-codeobj/code_printing.hpp rename to source/include/rocprofiler-sdk-codeobj/code_printing.hpp index 8183a182aa..98ff32044f 100644 --- a/source/lib/rocprofiler-sdk-codeobj/code_printing.hpp +++ b/source/include/rocprofiler-sdk-codeobj/code_printing.hpp @@ -28,10 +28,15 @@ #include #include #include - #include "disassembly.hpp" #include "segment.hpp" +namespace rocprofiler +{ +namespace codeobj +{ +namespace disassembly +{ struct Instruction { Instruction() = default; @@ -59,9 +64,15 @@ struct DSourceLine class CodeobjDecoderComponent { public: - CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size); + CodeobjDecoderComponent(const void* codeobj_data, uint64_t codeobj_size); ~CodeobjDecoderComponent(); + std::optional va2fo(uint64_t vaddr) + { + if(disassembly) return disassembly->va2fo(vaddr); + return {}; + }; + std::shared_ptr disassemble_instruction(uint64_t faddr, uint64_t vaddr); int m_fd; @@ -160,7 +171,7 @@ protected: /** * @brief Translates virtual addresses to elf file offsets */ -class CodeobjAddressTranslate : protected CodeobjMap +class CodeobjAddressTranslate : public CodeobjMap { using Super = CodeobjMap; @@ -255,3 +266,7 @@ public: private: CodeobjTableTranslator table; }; + +} // namespace disassembly +} // namespace codeobj +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-codeobj/disassembly.hpp b/source/include/rocprofiler-sdk-codeobj/disassembly.hpp similarity index 94% rename from source/lib/rocprofiler-sdk-codeobj/disassembly.hpp rename to source/include/rocprofiler-sdk-codeobj/disassembly.hpp index 4e06442db6..c244d7dc29 100644 --- a/source/lib/rocprofiler-sdk-codeobj/disassembly.hpp +++ b/source/include/rocprofiler-sdk-codeobj/disassembly.hpp @@ -28,6 +28,12 @@ #include #include +namespace rocprofiler +{ +namespace codeobj +{ +namespace disassembly +{ class CodeObjectBinary { public: @@ -66,3 +72,7 @@ public: amd_comgr_data_t data; std::map symbol_map; }; + +} // namespace disassembly +} // namespace codeobj +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-codeobj/segment.hpp b/source/include/rocprofiler-sdk-codeobj/segment.hpp similarity index 100% rename from source/lib/rocprofiler-sdk-codeobj/segment.hpp rename to source/include/rocprofiler-sdk-codeobj/segment.hpp diff --git a/source/include/rocprofiler-sdk/fwd.h b/source/include/rocprofiler-sdk/fwd.h index 3d2e0bf29b..0216b47e8a 100644 --- a/source/include/rocprofiler-sdk/fwd.h +++ b/source/include/rocprofiler-sdk/fwd.h @@ -92,6 +92,8 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL, ///< A service depends on a newer version of KFD ///< (amdgpu kernel driver). Check logs for ///< service that report incompatibility + ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES, ///< The given resources are + ///< insufficient to complete operation ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND, ///< Could not find the counter profile ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT, ///< Cannot enable both agent and dispatch ///< counting in the same context. diff --git a/source/include/rocprofiler-sdk/thread_trace.h b/source/include/rocprofiler-sdk/thread_trace.h index 1cf9dd66ce..c961b828dc 100644 --- a/source/include/rocprofiler-sdk/thread_trace.h +++ b/source/include/rocprofiler-sdk/thread_trace.h @@ -36,42 +36,23 @@ ROCPROFILER_EXTERN_C_INIT * @{ */ -typedef void (*rocprofiler_att_data_types_callback_t)(int64_t data_type_id, - const char* data_type_name, - void* userdata); - -rocprofiler_status_t -rocprofiler_att_iterate_data_types(rocprofiler_att_data_types_callback_t callback, - void* userdata) ROCPROFILER_API; - -typedef union +typedef enum { - struct - { - uint32_t enable_async_queue : 1; - uint32_t enable_occupancy_mode : 1; - uint32_t enable_double_buffering : 1; - uint32_t disable_att_markers : 1; - uint32_t disable_software_header : 1; - }; - uint32_t raw; -} rocprofiler_att_parameter_flag_t; + ROCPROFILER_ATT_PARAMETER_TARGET_CU = 0, + ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, + ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, + ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, + ROCPROFILER_ATT_PARAMETER_PERFCOUNTERS_CTRL, + ROCPROFILER_ATT_PARAMETER_PERFCOUNTER, + ROCPROFILER_ATT_PARAMETER_OCCUPANCY_MODE_ENABLE, + ROCPROFILER_ATT_PARAMETER_LAST +} rocprofiler_att_parameter_type_t; typedef struct { - rocprofiler_att_parameter_flag_t flags; - int shader_num; - int* shader_ids; - uint64_t buffer_size; - uint8_t target_cu; - uint8_t simd_select; - uint8_t reserved; - uint8_t vmid_mask; - uint16_t perfcounter_mask; - uint8_t perfcounter_ctrl; - uint8_t perfcounter_num; - const char** perfcounter; -} rocprofiler_att_parameters_t; + rocprofiler_att_parameter_type_t type; + uint64_t value; +} rocprofiler_att_parameter_t; typedef enum { @@ -89,20 +70,111 @@ typedef rocprofiler_att_control_flags_t (*rocprofiler_att_dispatch_callback_t)( uint64_t kernel_id, void* userdata); -typedef void (*rocprofiler_att_shader_data_callback_t)(int64_t shader_engine_id, - int64_t data_type_id, - const char* data_type_name, - void* data, - size_t data_size, - void* userdata); +typedef void (*rocprofiler_att_shader_data_callback_t)(int64_t shader_engine_id, + void* data, + size_t data_size, + void* userdata); rocprofiler_status_t rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id, - rocprofiler_att_parameters_t parameters, + rocprofiler_att_parameter_t* parameters, + size_t num_parameters, rocprofiler_att_dispatch_callback_t dispatch_callback, rocprofiler_att_shader_data_callback_t shader_callback, void* callback_userdata) ROCPROFILER_API; +/** + * @brief Callback for rocprofiler to parsed ATT data. + * The caller must copy a desired instruction on isa_instruction and source_reference, + * while obeying the max length passed by the caller. + * If the caller's length is insufficient, then this function writes the minimum sizes to isa_size + * and source_size and returns ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES. + * If call returns ROCPROFILER_STATUS_SUCCESS, isa_size and source_size are written with bytes used. + * @param[out] isa_instruction Where to copy the ISA line to. + * @param[out] isa_memory_size (Auto) The number of bytes to next instruction. 0 for custom ISA. + * @param[inout] isa_size Size of returned ISA string. + * @param[in] marker_id The generated ATT marker for given codeobject ID. + * @param[in] offset The offset from base vaddr for given codeobj ID. + * If marker_id == 0, this parameter is raw virtual address with no codeobj ID information. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + * @retval ROCPROFILER_STATUS_SUCCESS on success. + * @retval ROCPROFILER_STATUS_ERROR on generic error. + * @retval ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT for invalid offset or invalid marker_id. + * @retval ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES for insufficient isa_size or source_size. + */ +typedef rocprofiler_status_t (*rocprofiler_att_parser_isa_callback_t)(char* isa_instruction, + uint64_t* isa_memory_size, + uint64_t* isa_size, + uint64_t marker_id, + uint64_t offset, + void* userdata); + +/** + * @brief Callback for the ATT parser to retrieve Shader Engine data. + * Returns the amount of data filled. If no more data is available, then callback return 0 + * If the space available in the buffer is less than required for parsing the full data, + * the full data is transfered over multiple calls. + * When all data has been transfered from current shader_engine_id, the caller has the option to + * 1) Return -1 on shader_engine ID and parsing terminates + * 2) Move to the next shader engine. + * @param[out] shader_engine_id The ID of given shader engine. + * @param[out] buffer The buffer to fill up with SE data. + * @param[out] buffer_size The space available in the buffer. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + * @returns Number of bytes remaining in shader engine. + * @retval 0 if no more SE data is available. Parsing will stop. + * @retval buffer_size if the buffer does not hold enough data for the current shader engine. + * @retval 0 > ret > buffer_size for partially filled buffer, and caller moves over to next SE. + */ +typedef uint64_t (*rocprofiler_att_parser_se_data_callback_t)(int* shader_engine_id, + uint8_t** buffer, + uint64_t* buffer_size, + void* userdata); + +typedef enum +{ + ROCPROFILER_ATT_PARSER_DATA_TYPE_ISA = 0, + ROCPROFILER_ATT_PARSER_DATA_TYPE_OCCUPANCY, +} rocprofiler_att_parser_data_type_t; + +typedef struct +{ + uint64_t marker_id; + uint64_t offset; + uint64_t hitcount; + uint64_t latency; +} rocprofiler_att_data_type_isa_t; + +typedef struct +{ + uint64_t marker_id; + uint64_t offset; + uint64_t timestamp : 63; + uint64_t enabled : 1; +} rocprofiler_att_data_type_occupancy_t; + +/** + * @brief Callback for rocprofiler to return traces back to rocprofiler. + * @param[in] att_data A datapoint retrieved from thread_trace + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + */ +typedef void (*rocprofiler_att_parser_trace_callback_t)(rocprofiler_att_parser_data_type_t type, + void* att_data, + void* userdata); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param[in] se_data_callback Callback to return shader engine data from. + * @param[in] trace_callback Callback where the trace data is returned to. + * @param[in] isa_callback Callback to return ISA lines. + * @param[in] userdata Userdata passed back to caller via callback. + */ +rocprofiler_status_t +rocprofiler_att_parse_data(rocprofiler_att_parser_se_data_callback_t se_data_callback, + rocprofiler_att_parser_trace_callback_t trace_callback, + rocprofiler_att_parser_isa_callback_t isa_callback, + void* userdata); + /** @} */ ROCPROFILER_EXTERN_C_FINI diff --git a/source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt b/source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt index 6f45bd1dbd..59b78ea3d9 100644 --- a/source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt @@ -16,19 +16,17 @@ # USE OR OTHER DEALINGS # IN THE SOFTWARE. # ############################################################################## -set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "codeobj") +set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "development") -set(LIB_CODEOBJ_PARSER_SOURCES code_object_track.cpp code_printing.cpp disassembly.cpp) -set(LIB_CODEOBJ_PARSER_HEADERS code_object_track.hpp code_printing.hpp disassembly.hpp - segment.hpp) +set(CODEOBJ_PARSER_SOURCES code_printing.cpp disassembly.cpp) add_library(rocprofiler-sdk-codeobj STATIC) -target_sources(rocprofiler-sdk-codeobj PRIVATE ${LIB_CODEOBJ_PARSER_SOURCES}) +target_sources(rocprofiler-sdk-codeobj PRIVATE ${CODEOBJ_PARSER_SOURCES}) target_link_libraries( rocprofiler-sdk-codeobj PRIVATE rocprofiler::rocprofiler-amd-comgr rocprofiler::rocprofiler-dw rocprofiler::rocprofiler-elf rocprofiler::rocprofiler-build-flags - rocprofiler::rocprofiler-memcheck rocprofiler::rocprofiler-common-library) + rocprofiler::rocprofiler-common-library) target_include_directories(rocprofiler-sdk-codeobj PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) @@ -40,11 +38,9 @@ set_target_properties( install( TARGETS rocprofiler-sdk-codeobj - DESTINATION ${CMAKE_INSTALL_LIBDIR}/rocprofiler-sdk - COMPONENT tools + DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT rocprofiler-sdk-codeobj-targets) -install( - FILES ${LIB_CODEOBJ_PARSER_HEADERS} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk-codeobj - COMPONENT tools) +if(ROCPROFILER_BUILD_TESTS) + add_subdirectory(tests) +endif() diff --git a/source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp b/source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp deleted file mode 100644 index 5ce74d62f8..0000000000 --- a/source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// MIT License -// -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "code_object_track.hpp" - -void -CodeobjRecorder::Load(uint64_t addr, - uint64_t load_size, - const std::string& URI, - uint64_t mem_addr, - uint64_t mem_size, - uint64_t id) -{ - Load(std::make_shared( - addr, load_size, URI, mem_addr, mem_size, id, capture_mode)); -} - -void -CodeobjCaptureInstance::copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path) -{ - std::ifstream file(path, std::ios::in | std::ios::binary); - if(!file) - { - printf("could not open `%s'\n", path.c_str()); - return; - } - - if(!size) - { - file.ignore(std::numeric_limits::max()); - size_t bytes = file.gcount(); - file.clear(); - - if(bytes < offset) - { - printf("invalid uri `%s' (file size < offset)\n", path.c_str()); - return; - } - size = bytes - offset; - } - - file.seekg(offset, std::ios_base::beg); - buffer.resize(size); - file.read(&buffer[0], size); -} - -void CodeobjCaptureInstance::copyCodeobjFromMemory(uint64_t, uint64_t) -{ - // buffer.resize(mem_size); - // std::memcpy(buffer.data(), (uint64_t*)mem_addr, mem_size); -} - -std::pair -CodeobjCaptureInstance::parse_uri() -{ - const std::string protocol_delim{"://"}; - - size_t protocol_end = URI.find(protocol_delim); - protocol = URI.substr(0, protocol_end); - protocol_end += protocol_delim.length(); - - std::transform(protocol.begin(), protocol.end(), protocol.begin(), [](unsigned char c) { - return std::tolower(c); - }); - - std::string path; - size_t path_end = URI.find_first_of("#?", protocol_end); - if(path_end != std::string::npos) - { - path = URI.substr(protocol_end, path_end++ - protocol_end); - } - else - { - path = URI.substr(protocol_end); - } - - /* %-decode the string. */ - decoded_path = std::string{}; - decoded_path.reserve(path.length()); - for(size_t i = 0; i < path.length(); ++i) - { - if(path[i] == '%' && std::isxdigit(path[i + 1]) && std::isxdigit(path[i + 2])) - { - decoded_path += std::stoi(path.substr(i + 1, 2), 0, 16); - i += 2; - } - else - { - decoded_path += path[i]; - } - } - - /* Tokenize the query/fragment. */ - std::vector tokens; - size_t pos, last = path_end; - while((pos = URI.find('&', last)) != std::string::npos) - { - tokens.emplace_back(URI.substr(last, pos - last)); - last = pos + 1; - } - if(last != std::string::npos) tokens.emplace_back(URI.substr(last)); - - /* Create a tag-value map from the tokenized query/fragment. */ - std::unordered_map params; - std::for_each(tokens.begin(), tokens.end(), [&](std::string& token) { - size_t delim = token.find('='); - if(delim != std::string::npos) - { - params.emplace(token.substr(0, delim), token.substr(delim + 1)); - } - }); - - size_t offset = 0; - size_t size = 0; - - if(auto offset_it = params.find("offset"); offset_it != params.end()) - offset = std::stoul(offset_it->second, nullptr, 0); - - if(auto size_it = params.find("size"); size_it != params.end()) - { - if(!(size = std::stoul(size_it->second, nullptr, 0))) throw std::exception(); - } - - return {offset, size}; -} - -void -CodeobjCaptureInstance::reset(codeobj_capture_mode_t mode) -{ - if(static_cast(mode) <= static_cast(capture_mode)) return; - - capture_mode = mode; - if(!buffer.empty()) return; - - size_t offset, size; - try - { - std::tie(offset, size) = parse_uri(); - } catch(...) - { - std::cerr << "Error parsing URI " << URI << std::endl; - return; - } - - if(protocol == "file") - { - if(mode == ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY) - copyCodeobjFromFile(offset, size, decoded_path); - } - else if(protocol == "memory") - { - if(mode != ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY) - copyCodeobjFromMemory(mem_addr, mem_size); - } - else - { - printf("\"%s\" protocol not supported\n", protocol.c_str()); - } -} diff --git a/source/lib/rocprofiler-sdk-codeobj/code_object_track.hpp b/source/lib/rocprofiler-sdk-codeobj/code_object_track.hpp deleted file mode 100644 index f279b004b4..0000000000 --- a/source/lib/rocprofiler-sdk-codeobj/code_object_track.hpp +++ /dev/null @@ -1,158 +0,0 @@ -// MIT License -// -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#pragma once - -#include -#include -#include -#include -#include -#include - -/** - * Enum defines how code object is captured for ATT and PC Sampling - */ -enum codeobj_capture_mode_t -{ - /** - * Capture file and memory paths for the loaded code object - */ - ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY = 0, - /** - * Capture symbols for file:// and memory:// type objects, - * and generate a copy of all kernel code for objects under memory:// - */ - ROCPROFILER_CODEOBJ_CAPTURE_COPY_MEMORY = 1, - /** - * Capture symbols and all kernel code for file:// and memory:// type objects - */ - ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY = 2, - ROCPROFILER_CODEOBJ_CAPTURE_LAST = 3, -}; - -/** - * A class to keep track of currently loaded code objects. - * Only the public static methods are thread-safe and expected to be used. - */ -class CodeobjCaptureInstance -{ -public: - CodeobjCaptureInstance(uint64_t _addr, - uint64_t _load_size, - const std::string& _uri, - uint64_t _mem_addr, - uint64_t _mem_size, - uint64_t id, - codeobj_capture_mode_t mode) - : addr(_addr) - , load_size(_load_size) - , load_id(id) - , URI(_uri) - , mem_addr(_mem_addr) - , mem_size(_mem_size) - { - reset(mode); - }; - - const uint64_t addr; - const uint64_t load_size; - const uint64_t load_id; - -private: - void reset(codeobj_capture_mode_t mode); - - std::pair parse_uri(); - void DecodePath(); - void copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path); - void copyCodeobjFromMemory(uint64_t, uint64_t); - - std::string URI{}; - std::string decoded_path{}; - std::string protocol{}; - std::vector buffer{}; - - uint64_t mem_addr = 0; - uint64_t mem_size = 0; - codeobj_capture_mode_t capture_mode = ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY; -}; - -typedef std::shared_ptr CodeobjPtr; - -template <> -struct std::hash -{ - uint64_t operator()(const CodeobjPtr& p) const { return p->load_id; } -}; - -template <> -struct std::equal_to -{ - bool operator()(const CodeobjPtr& a, const CodeobjPtr& b) const - { - return (a->addr == b->addr) && (a->load_id == b->load_id); - }; -}; - -/** - * A class to keep track of the history of loaded code objets. - * Only the public static methods are thread-safe and expected to be used. - */ -class CodeobjRecorder -{ -public: - CodeobjRecorder(codeobj_capture_mode_t mode) - : capture_mode(mode){}; - - void Load(uint64_t _addr, - uint64_t _load_size, - const std::string& _uri, - uint64_t mem_addr, - uint64_t mem_size, - uint64_t id); - void Load(CodeobjPtr capture) - { - std::lock_guard lk(mutex); - captures[capture->load_id] = capture; - } - void Unload(uint64_t id) - { - std::lock_guard lk(mutex); - captures.erase(id); - }; - -public: - std::shared_mutex mutex; - - std::vector get() - { - std::vector vec; - std::shared_lock lk(mutex); - for(auto& [k, v] : captures) - vec.push_back(v); - return vec; - }; - -private: - codeobj_capture_mode_t capture_mode; - std::unordered_map captures; -}; diff --git a/source/lib/rocprofiler-sdk-codeobj/code_printing.cpp b/source/lib/rocprofiler-sdk-codeobj/code_printing.cpp index cdda075ece..a93c5c7b8b 100644 --- a/source/lib/rocprofiler-sdk-codeobj/code_printing.cpp +++ b/source/lib/rocprofiler-sdk-codeobj/code_printing.cpp @@ -20,8 +20,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" - #include #include #include @@ -35,6 +33,8 @@ #include #include +#include + #include #include #include @@ -72,7 +72,13 @@ } \ catch(...) { return returndata; } -CodeobjDecoderComponent::CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size) +namespace rocprofiler +{ +namespace codeobj +{ +namespace disassembly +{ +CodeobjDecoderComponent::CodeobjDecoderComponent(const void* codeobj_data, uint64_t codeobj_size) { m_fd = -1; #if defined(_GNU_SOURCE) && defined(MFD_ALLOW_SEALING) && defined(MFD_CLOEXEC) @@ -87,7 +93,7 @@ CodeobjDecoderComponent::CodeobjDecoderComponent(const char* codeobj_data, uint6 return; } - if(size_t size = ::write(m_fd, codeobj_data, codeobj_size); size != codeobj_size) + if(size_t size = ::write(m_fd, (const char*) codeobj_data, codeobj_size); size != codeobj_size) { printf("could not write to the temporary file\n"); return; @@ -150,7 +156,7 @@ CodeobjDecoderComponent::CodeobjDecoderComponent(const char* codeobj_data, uint6 } // Can throw - disassembly = std::make_unique(codeobj_data, codeobj_size); + disassembly = std::make_unique((const char*) codeobj_data, codeobj_size); if(m_line_number_map.size()) { size_t total_size = 0; @@ -250,7 +256,7 @@ LoadedCodeobjDecoder::add_to_map(uint64_t ld_addr) if(!decoder || ld_addr < load_addr) throw std::out_of_range("Addr not in decoder"); uint64_t voffset = ld_addr - load_addr; - auto faddr = decoder->disassembly->va2fo(voffset); + auto faddr = decoder->va2fo(voffset); if(!faddr) throw std::out_of_range("Could not find file offset"); auto shared = decoder->disassemble_instruction(*faddr, voffset); @@ -275,4 +281,6 @@ LoadedCodeobjDecoder::get(uint64_t addr) return nullptr; } -#define PUBLIC_API __attribute__((visibility("default"))) +} // namespace disassembly +} // namespace codeobj +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-codeobj/disassembly.cpp b/source/lib/rocprofiler-sdk-codeobj/disassembly.cpp index a184c72127..ab4fa6abbb 100644 --- a/source/lib/rocprofiler-sdk-codeobj/disassembly.cpp +++ b/source/lib/rocprofiler-sdk-codeobj/disassembly.cpp @@ -50,9 +50,8 @@ #include #include -#include #include -#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" +#include #define THROW_COMGR(call) \ if(amd_comgr_status_s status = call) \ @@ -74,6 +73,12 @@ return AMD_COMGR_STATUS_ERROR; \ } +namespace rocprofiler +{ +namespace codeobj +{ +namespace disassembly +{ CodeObjectBinary::CodeObjectBinary(const std::string& uri) : m_uri(uri) { @@ -151,7 +156,7 @@ CodeObjectBinary::CodeObjectBinary(const std::string& uri) if(!(size = std::stoul(size_it->second, nullptr, 0))) return; } - if(protocol != "file") throw protocol + " protocol not supported!"; + if(protocol == "memory") throw protocol + " protocol not supported!"; std::ifstream file(decoded_path, std::ios::in | std::ios::binary); if(!file || !file.is_open()) throw "could not open " + decoded_path; @@ -370,3 +375,7 @@ DisassemblyInstance::getSegments() return segments; } + +} // namespace disassembly +} // namespace codeobj +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk-codeobj/tests/CMakeLists.txt b/source/lib/rocprofiler-sdk-codeobj/tests/CMakeLists.txt new file mode 100644 index 0000000000..ccc609d265 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/tests/CMakeLists.txt @@ -0,0 +1,33 @@ +rocprofiler_deactivate_clang_tidy() + +include(GoogleTest) +add_executable(codeobj-library-test) + +set(CODEOBJ_LIB_TEST_SOURCES "codeobj_library_test.cpp") +target_sources(codeobj-library-test PRIVATE ${CODEOBJ_LIB_TEST_SOURCES}) + +target_link_libraries( + codeobj-library-test + PRIVATE rocprofiler::rocprofiler-static-library + rocprofiler::rocprofiler-glog + rocprofiler::rocprofiler-hsa-runtime + rocprofiler::rocprofiler-hip + rocprofiler::rocprofiler-common-library + GTest::gtest + GTest::gtest_main + rocprofiler-sdk-codeobj) + +gtest_add_tests( + TARGET codeobj-library-test + SOURCES ${CODEOBJ_LIB_TEST_SOURCES} + TEST_LIST codeobj-library-test_TESTS + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +set_tests_properties(${codeobj-library-test_TESTS} PROPERTIES TIMEOUT 10 LABELS + "unittests") + +target_compile_definitions(codeobj-library-test + PRIVATE -DCODEOBJ_BINARY_DIR=\"${CMAKE_CURRENT_BINARY_DIR}/\") + +configure_file(smallkernel.b smallkernel.b COPYONLY) +configure_file(hipcc_output.s hipcc_output.s COPYONLY) diff --git a/source/lib/rocprofiler-sdk-codeobj/tests/codeobj_library_test.cpp b/source/lib/rocprofiler-sdk-codeobj/tests/codeobj_library_test.cpp new file mode 100644 index 0000000000..ea6acb3eb1 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/tests/codeobj_library_test.cpp @@ -0,0 +1,259 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include + +#ifndef CODEOBJ_BINARY_DIR +static_assert(false && "Please define CODEOBJ_BINARY_DIR to codeobj tests binary, " + "e.g. ../source/lib/rocprofiler-sdk-codeobj/tests/"); +#endif + +namespace rocprofiler +{ +namespace testing +{ +namespace codeobjhelper +{ +std::string +removeNull(std::string_view s) +{ + std::string u(s); + while(u.find("null") != std::string::npos) + u = u.substr(0, u.find("null")) + "0x0" + u.substr(u.find("null") + 4); + return u; +} + +static const std::vector& +GetHipccOutput() +{ + static std::vector result = []() { + std::ifstream file(CODEOBJ_BINARY_DIR "hipcc_output.s"); + std::vector ret; + + while(file.good()) + { + std::string s; + getline(file, s); + ret.push_back(removeNull(s)); + } + return ret; + }(); + return result; +} + +static const std::vector& +GetCodeobjContents() +{ + static std::vector buffer = []() { + std::string filename = CODEOBJ_BINARY_DIR "smallkernel.b"; + std::ifstream file(filename.data(), std::ios::binary); + + using iterator_t = std::istreambuf_iterator; + return std::vector(iterator_t(file), iterator_t()); + }(); + return buffer; +} + +} // namespace codeobjhelper +} // namespace testing +} // namespace rocprofiler + +TEST(codeobj_library, segment_test) +{ + CodeobjTableTranslator table; + std::unordered_set used_addr{}; + + for(size_t ITER = 0; ITER < 50; ITER++) + { + for(int j = 0; j < 2500; j++) + { + size_t addr = rand() % 10000000; + size_t size = (rand() % 10) + 1; + if(used_addr.find(addr) != used_addr.end()) continue; + used_addr.insert(addr); + table.insert({addr, addr + size, 0, 0}); + } + + for(size_t i = 1; i < table.size(); i++) + ASSERT_LT(table[i - 1], table[i]); + + for(size_t i = 0; i < 2400; i++) + { + size_t idx = rand() % table.size(); + auto rdelem = table[idx]; + used_addr.erase(rdelem.vbegin); + ASSERT_NE(table.remove(rdelem.vbegin), 0); + } + } +} + +namespace disassembly = rocprofiler::codeobj::disassembly; +namespace codeobjhelper = rocprofiler::testing::codeobjhelper; +using CodeobjDecoderComponent = rocprofiler::codeobj::disassembly::CodeobjDecoderComponent; +using LoadedCodeobjDecoder = rocprofiler::codeobj::disassembly::LoadedCodeobjDecoder; + +TEST(codeobj_library, file_opens) +{ + ASSERT_NE(codeobjhelper::GetHipccOutput().size(), 0); + ASSERT_NE(codeobjhelper::GetCodeobjContents().size(), 0); +} + +TEST(codeobj_library, decoder_component) +{ + const std::vector& hiplines = codeobjhelper::GetHipccOutput(); + const std::vector& objdata = codeobjhelper::GetCodeobjContents(); + constexpr size_t loaded_offset = 0x3000; + + CodeobjDecoderComponent component(objdata.data(), objdata.size()); + + std::string kernel_with_protocol = "file://" CODEOBJ_BINARY_DIR "smallkernel.b"; + LoadedCodeobjDecoder loadecomp(kernel_with_protocol.data(), loaded_offset, objdata.size()); + + ASSERT_EQ(component.m_symbol_map.size(), 1); + + for(auto& [kaddr, symbol] : component.m_symbol_map) + { + ASSERT_NE(symbol.name.find("reproducible_runtime"), std::string::npos); + ASSERT_NE(symbol.mem_size, 0); + + size_t it = 0; + size_t vaddr = kaddr; + while(vaddr < kaddr + symbol.mem_size) + { + if(!component.va2fo(vaddr)) + { + ASSERT_NE(0, 0); + } + + uint64_t faddr = *component.va2fo(vaddr); + ASSERT_EQ(faddr - symbol.faddr, vaddr - kaddr); + + auto instruction = component.disassemble_instruction(faddr, vaddr); + auto loaded_instruction = loadecomp.get(vaddr + loaded_offset); + + ASSERT_NE(codeobjhelper::removeNull(instruction->inst).find(hiplines.at(it)), + std::string::npos); + ASSERT_EQ(instruction->inst, loaded_instruction->inst); + vaddr += instruction->size; + it++; + } + } +} + +TEST(codeobj_library, loaded_codeobj_component) +{ + const std::vector& objdata = rocprofiler::testing::codeobjhelper::GetCodeobjContents(); + constexpr size_t offset = 0x1000; + constexpr size_t memsize = 0x1000; + + LoadedCodeobjDecoder decoder((const void*) objdata.data(), objdata.size(), offset, memsize); + + for(auto& [kaddr, symbol] : decoder.getSymbolMap()) + { + ASSERT_NE(symbol.name.find("reproducible_runtime"), std::string::npos); + ASSERT_NE(symbol.mem_size, 0); + } +} + +TEST(codeobj_library, codeobj_map_test) +{ + const std::vector& objdata = rocprofiler::testing::codeobjhelper::GetCodeobjContents(); + constexpr size_t laddr1 = 0x1000; + constexpr size_t laddr3 = 0x3000; + + uint64_t kaddr = [&objdata]() { + CodeobjDecoderComponent comp((const void*) objdata.data(), objdata.size()); + for(auto& [addr, _] : comp.m_symbol_map) + return addr; + return 0ul; + }(); + + EXPECT_NE(kaddr, 0); + + disassembly::CodeobjMap map; + map.addDecoder((const void*) objdata.data(), + objdata.size(), + codeobj_marker_id_t{1}, + laddr1, + objdata.size()); + map.addDecoder((const void*) objdata.data(), + objdata.size(), + codeobj_marker_id_t{3}, + laddr3, + objdata.size()); + + EXPECT_EQ(map.get(codeobj_marker_id_t{1}, kaddr)->inst, + map.get(codeobj_marker_id_t{3}, kaddr)->inst); + + ASSERT_EQ(map.removeDecoderbyId(1), true); + ASSERT_EQ(map.removeDecoderbyId(3), true); + ASSERT_EQ(map.removeDecoderbyId(1), false); +} + +TEST(codeobj_library, codeobj_table_test) +{ + const std::vector& hiplines = codeobjhelper::GetHipccOutput(); + const std::vector& objdata = codeobjhelper::GetCodeobjContents(); + constexpr size_t laddr1 = 0x1000; + constexpr size_t laddr3 = 0x3000; + + disassembly::CodeobjAddressTranslate map; + + uint64_t kaddr = 0, memsize = 0; + std::tie(kaddr, memsize) = [&objdata]() { + CodeobjDecoderComponent comp((const void*) objdata.data(), objdata.size()); + for(auto& [addr, symbol] : comp.m_symbol_map) + return std::pair(addr, symbol.mem_size); + return std::pair(0, 0); + }(); + ASSERT_NE(kaddr, 0); + ASSERT_NE(memsize, 0); + + map.addDecoder( + (const void*) objdata.data(), objdata.size(), codeobj_marker_id_t{1}, laddr1, 0x2000); + map.addDecoder( + (const void*) objdata.data(), objdata.size(), codeobj_marker_id_t{3}, laddr3, 0x2000); + + EXPECT_NE(map.get(laddr1 + kaddr).get(), nullptr); + EXPECT_NE(map.get(laddr3 + kaddr).get(), nullptr); + EXPECT_EQ(map.get(laddr1 + kaddr)->inst, map.get(laddr3 + kaddr)->inst); + + size_t it = 0; + size_t vaddr = kaddr; + while(vaddr < kaddr + memsize) + { + auto instruction = map.get(laddr1 + vaddr); + ASSERT_NE(codeobjhelper::removeNull(instruction->inst).find(hiplines.at(it)), + std::string::npos); + vaddr += instruction->size; + it++; + } + + ASSERT_EQ(map.removeDecoderbyId(1), true); + ASSERT_EQ(map.removeDecoderbyId(3), true); + ASSERT_EQ(map.removeDecoderbyId(1), false); +} diff --git a/source/lib/rocprofiler-sdk-codeobj/tests/hipcc_output.s b/source/lib/rocprofiler-sdk-codeobj/tests/hipcc_output.s new file mode 100644 index 0000000000..04632d6cf5 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/tests/hipcc_output.s @@ -0,0 +1,17 @@ +s_load_b64 s[0:1], s[0:1], 0x0 +s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES, 0, 20) +s_waitcnt vmcnt(0) lgkmcnt(0) +s_waitcnt_vscnt null, 0x0 +s_barrier +s_waitcnt vmcnt(0) lgkmcnt(0) +s_waitcnt_vscnt null, 0x0 +buffer_gl0_inv +s_getreg_b32 s3, hwreg(HW_REG_SHADER_CYCLES, 0, 20) +s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +s_sub_u32 s4, s3, s2 +s_subb_u32 s5, 0, 0 +v_cmp_lt_i64_e64 s3, s[4:5], s[0:1] +s_delay_alu instid0(VALU_DEP_1) +s_and_b32 vcc_lo, exec_lo, s3 +s_cbranch_vccnz 65520 +s_endpgm diff --git a/source/lib/rocprofiler-sdk-codeobj/tests/smallkernel.b b/source/lib/rocprofiler-sdk-codeobj/tests/smallkernel.b new file mode 100644 index 0000000000..2fbad68023 Binary files /dev/null and b/source/lib/rocprofiler-sdk-codeobj/tests/smallkernel.b differ diff --git a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h index 22613115d0..2d6a6da1e6 100644 --- a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h +++ b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h @@ -1,11 +1,34 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + #pragma once #include #include -#define PUBLIC_API - +#ifdef __cplusplus extern "C" { +#endif + typedef struct { uint64_t handle; @@ -126,7 +149,7 @@ typedef struct * @retval HSA_STATUS_SUCCESS registration ok * @retval HSA_STATUS_ERROR registration failed */ -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_register_agent(aqlprofile_agent_handle_t* agent_id, const aqlprofile_agent_info_t* agent_info); @@ -156,7 +179,7 @@ typedef enum // counters disable command buffer } aqlprofile_pmc_info_type_t; -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile, aqlprofile_pmc_info_type_t attribute, void* value); @@ -221,7 +244,7 @@ typedef hsa_status_t (*aqlprofile_memory_copy_t)(void* dst, * @retval HSA_STATUS_SUCCESS if the event was validated. * @retval HSA_STATUS_ERROR if the event was not validated. */ -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_validate_pmc_event(aqlprofile_agent_handle_t agent, const aqlprofile_pmc_event_t* event, bool* result); @@ -235,7 +258,7 @@ aqlprofile_validate_pmc_event(aqlprofile_agent_handle_t agent, * @retval HSA_STATUS_ERROR if some callback returns an error * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given */ -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_pmc_iterate_data(aqlprofile_handle_t handle, aqlprofile_pmc_data_callback_t callback, void* userdata); @@ -259,7 +282,7 @@ typedef struct * @param[in] dealloc_cb Function to free memory allocated by alloc_cb * @param[in] userdata Data passed back to user via memory alloc callback */ -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_pmc_create_packets(aqlprofile_handle_t* handle, aqlprofile_pmc_aql_packets_t* packets, aqlprofile_pmc_profile_t profile, @@ -272,7 +295,7 @@ aqlprofile_pmc_create_packets(aqlprofile_handle_t* handle, * @brief Function to delete AQL packets after creation by aqlprofile_pmc_create_packets * @param[in] handle Returned by aqlprofile_pmc_create_packets() */ -PUBLIC_API void +void aqlprofile_pmc_delete_packets(aqlprofile_handle_t handle); /** @@ -284,7 +307,7 @@ aqlprofile_pmc_delete_packets(aqlprofile_handle_t handle); * @retval HSA_STATUS_ERROR if some callback returns an error * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given */ -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_att_iterate_data(aqlprofile_handle_t handle, aqlprofile_att_data_callback_t callback, void* userdata); @@ -307,7 +330,7 @@ typedef struct * @retval HSA_STATUS_SUCCESS if all packets created succesfully * @retval HSA_STATUS_ERROR otherwise */ -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_att_create_packets(aqlprofile_handle_t* handle, aqlprofile_att_control_aql_packets_t* packets, aqlprofile_att_profile_t profile, @@ -316,7 +339,7 @@ aqlprofile_att_create_packets(aqlprofile_handle_t* handle, aqlprofile_memory_copy_t memcpy_cb, void* userdata); -PUBLIC_API void +void aqlprofile_att_delete_packets(aqlprofile_handle_t handle); /** @@ -453,20 +476,21 @@ enum WaveTrapStatus TRAP_STANDBY = 2 }; -struct __attribute__((packed)) pcinfo_t +typedef struct { size_t addr; - int marker_id; -}; + size_t marker_id; +} pcinfo_t; typedef struct __attribute__((packed)) { + uint64_t category : 8; + uint64_t hitcount : 56; + uint64_t latency; pcinfo_t pc; - int hitcount; - size_t latency; } att_trace_event_t; -struct wave_data_t +typedef struct { uint8_t simd; uint8_t wave_id; @@ -474,65 +498,42 @@ struct wave_data_t uint8_t reserved; // VMEM Pipeline: instrs and stalls - int num_vmem_instrs = 0; - int num_vmem_stalls = 0; + int num_vmem_instrs; + int num_vmem_stalls; // FLAT instrs and stalls - int num_flat_instrs = 0; - int num_flat_stalls = 0; + int num_flat_instrs; + int num_flat_stalls; // LDS instr and stalls - int num_lds_instrs = 0; - int num_lds_stalls = 0; + int num_lds_instrs; + int num_lds_stalls; // SCA instrs stalls - int num_salu_instrs = 0; - int num_smem_instrs = 0; - int num_salu_stalls = 0; - int num_smem_stalls = 0; + int num_salu_instrs; + int num_smem_instrs; + int num_salu_stalls; + int num_smem_stalls; // Branch - int num_branch_instrs = 0; - int num_branch_taken_instrs = 0; - int num_branch_stalls = 0; + int num_branch_instrs; + int num_branch_taken_instrs; + int num_branch_stalls; // total VMEM/FLAT/LDS/SMEM instructions issued - int num_mem_instrs = 0; // total issued memory instructions - int num_valu_stalls = 0; - size_t num_valu_instrs = 0; - size_t num_issued_instrs = 0; // total issued instructions (compute + memory) + int num_mem_instrs; // total issued memory instructions + int num_valu_stalls; + size_t num_valu_instrs; + size_t num_issued_instrs; // total issued instructions (compute + memory) - int64_t begin_time = 0; // Begin and end cycle - int64_t end_time = 0; - int64_t traceID = -1; + int64_t begin_time; // Begin and end cycle + int64_t end_time; + int64_t traceID; - size_t timeline_size = 0; - size_t instructions_size = 0; + size_t timeline_size; + size_t instructions_size; wave_state_t* timeline_array; wave_instruction_t* instructions_array; -}; - -/** - * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. - * @param [in] id Integer identifying type ID. - * @param [in] name Name of the trace type. - * @param [in] userdata User data supplied to back caller - * @retval HSA_STATUS_SUCCESS Continues iteration - * @retval OTHERS Any other HSA return values stops iteration, passing back this value through - * @ref aqlprofile_iterate_trace_type_ids - */ -typedef hsa_status_t (*aqlprofile_att_tracename_callback_t)(int id, const char* name, void* data); - -/** - * @brief Iterate over all possible event coordinate IDs and their names. - * @param [in] callback Callback to use for iteration of trace types - * @param [in] userdata Data to supply to callback @ref aqlprofile_tracename_callback_t - * @retval HSA_STATUS_SUCCESS if successful - * @retval HSA_STATUS_ERROR if error on interation - * @retval OTHERS If @ref aqlprofile_eventname_callback_t returns non-HSA_STATUS_SUCCESS, - * that value is returned. - */ -PUBLIC_API hsa_status_t -aqlprofile_att_iterate_trace_type_ids(aqlprofile_att_tracename_callback_t callback, void* userdata); +} wave_data_t; /** * @brief Callback for rocprofiler to return ISA to aqlprofile ATT parser. @@ -560,7 +561,7 @@ typedef hsa_status_t (*aqlprofile_att_isa_callback_t)(char* isa_instruction, uint64_t* isa_memory_size, uint64_t* isa_size, uint64_t* source_size, - uint32_t marker_id, + uint64_t marker_id, uint64_t offset, void* userdata); @@ -603,11 +604,11 @@ typedef uint64_t (*aqlprofile_att_se_data_callback_t)(int* shader_engine_id /** * @brief Callback returning from aqlprofile_att_parser_iterate_event_list * @param[in] trace_event_id ID of the event. - * @param[in] trace_event_name Event name. + * @param[in] trace_event_metadata Null-terminated string, entries separated by ';' * @param[in] userdata userdata. */ typedef void (*aqlprofile_att_parser_iterate_event_cb_t)(int trace_event_id, - const char* trace_event_name, + const char* trace_event_metadata, void* userdata); /** @@ -615,7 +616,7 @@ typedef void (*aqlprofile_att_parser_iterate_event_cb_t)(int trace_event * @param[in] callback Callback where events are returned to. * @param[in] userdata userdata. */ -hsa_status_t +void aqlprofile_att_parser_iterate_event_list(aqlprofile_att_parser_iterate_event_cb_t callback, void* userdata); @@ -634,17 +635,17 @@ aqlprofile_att_parse_data(aqlprofile_att_se_data_callback_t se_data_callback, void* userdata); /** - * @brief Contains information of code objects. IDs can be reused for different load addresses. + * @brief Contains flags for how code objects are interpreted */ typedef union { - uint32_t raw; struct { uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started? - uint32_t id : 30; // To be passed back to isa_string_callback in marker_id + uint32_t legacy_id : 30; // Legacy code object ID, if it fits in 30 bits. }; + uint32_t raw; } aqlprofile_att_header_marker_t; /** @@ -652,6 +653,7 @@ typedef union * @param[out] packets Returned packet * @param[in] handle The handle created from aqlprofile_att_create_packets() * @param[in] header Header containing code object information created from profiler + * @param[in] id To be passed back to isa_string_callback in marker_id * @param[in] addr Code object loaded address. * @param[in] size Code object loaded size. */ @@ -659,6 +661,10 @@ hsa_status_t aqlprofile_att_codeobj_load_marker(hsa_ext_amd_aql_pm4_packet_t* packets, aqlprofile_handle_t handle, aqlprofile_att_header_marker_t header, + uint64_t id, uint64_t addr, uint64_t size); + +#ifdef __cplusplus } +#endif diff --git a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp index 9b149a8f0d..61bba70691 100644 --- a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp +++ b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp @@ -202,10 +202,10 @@ CounterPacketConstruct::construct_packet(const AmdExtTable& ext) #pragma GCC diagnostic ignored "-Wnarrowing" ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory( - const hsa::AgentCache& agent, - std::shared_ptr& params, - const CoreApiTable& coreapi, - const AmdExtTable& ext) + const hsa::AgentCache& agent, + std::shared_ptr& params, + const CoreApiTable& coreapi, + const AmdExtTable& ext) { this->tracepool = std::make_shared(); this->tracepool->allocate_fn = ext.hsa_amd_memory_pool_allocate_fn; @@ -231,20 +231,22 @@ ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory( std::unique_ptr ThreadTraceAQLPacketFactory::construct_packet() { - auto packet = std::make_unique(this->tracepool); - /*hsa_status_t _status = aqlprofile_att_create_packets(&packet->handle, + auto packet = std::make_unique(this->tracepool); + hsa_status_t _status = aqlprofile_att_create_packets(&packet->handle, &packet->packets, this->profile, &hsa::TraceAQLPacket::Alloc, &hsa::TraceAQLPacket::Free, &hsa::TraceAQLPacket::Copy, packet.get()); - CHECK_HSA(_status, "failed to create ATT packet");*/ + CHECK_HSA(_status, "failed to create ATT packet"); packet->before_krn_pkt.clear(); packet->after_krn_pkt.clear(); packet->packets.start_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; packet->packets.stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + packet->packets.start_packet.completion_signal = hsa_signal_t{.handle = 0}; + packet->packets.stop_packet.completion_signal = hsa_signal_t{.handle = 0}; packet->empty = false; packet->start = packet->packets.start_packet; diff --git a/source/lib/rocprofiler-sdk/aql/packet_construct.hpp b/source/lib/rocprofiler-sdk/aql/packet_construct.hpp index ad080b41f1..6f46c13bc3 100644 --- a/source/lib/rocprofiler-sdk/aql/packet_construct.hpp +++ b/source/lib/rocprofiler-sdk/aql/packet_construct.hpp @@ -87,10 +87,10 @@ protected: class ThreadTraceAQLPacketFactory { public: - ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent, - std::shared_ptr& params, - const CoreApiTable& coreapi, - const AmdExtTable& ext); + ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent, + std::shared_ptr& params, + const CoreApiTable& coreapi, + const AmdExtTable& ext); std::unique_ptr construct_packet(); private: diff --git a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp index d43a842cab..ec076b4011 100644 --- a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp +++ b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp @@ -64,13 +64,7 @@ CounterAQLPacket::~CounterAQLPacket() } } -TraceAQLPacket::~TraceAQLPacket() = default; -/* -TraceAQLPacket::~TraceAQLPacket() -{ - aqlprofile_att_delete_packets(this->handle); -} -*/ +TraceAQLPacket::~TraceAQLPacket() { aqlprofile_att_delete_packets(this->handle); } hsa_status_t TraceAQLPacket::Alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t flags, void* data) diff --git a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp index dd5f09f73e..b3d7cd1fb1 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp +++ b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp @@ -278,14 +278,6 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) break; } } - else if(itr->callback_tracer) - { - if(itr->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH)) - { - enable_intercepter = true; - break; - } - } else if(itr->thread_trace) { enable_intercepter = true; @@ -300,6 +292,14 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) }); break; } + else if(itr->callback_tracer) + { + if(itr->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH)) + { + enable_intercepter = true; + break; + } + } } if(enable_intercepter) diff --git a/source/lib/rocprofiler-sdk/rocprofiler.cpp b/source/lib/rocprofiler-sdk/rocprofiler.cpp index 72d040da4b..4981d7c7aa 100644 --- a/source/lib/rocprofiler-sdk/rocprofiler.cpp +++ b/source/lib/rocprofiler-sdk/rocprofiler.cpp @@ -89,6 +89,8 @@ ROCPROFILER_STATUS_STRING( "AQL Profiler was not able to find event coordinates for defined counters") ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL, "A service depends on a newer version of KFD (amdgpu kernel driver)") +ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES, + "The given resources are insufficient to complete operation") ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND, "Could not find counter profile") ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT, diff --git a/source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt b/source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt index 8d54dc8e30..802ad64fbe 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt @@ -1,6 +1,8 @@ -set(ROCPROFILER_LIB_THREAD_TRACE_SOURCES att_core.cpp att_service.cpp) +set(ROCPROFILER_LIB_THREAD_TRACE_SOURCES att_core.cpp att_service.cpp att_parser.cpp) set(ROCPROFILER_LIB_THREAD_TRACE_HEADERS att_core.hpp) target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_THREAD_TRACE_SOURCES} ${ROCPROFILER_LIB_THREAD_TRACE_HEADERS}) -# if(ROCPROFILER_BUILD_TESTS) add_subdirectory(tests) endif() +if(ROCPROFILER_BUILD_TESTS) + add_subdirectory(tests) +endif() diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp b/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp index 0a645ce081..05f67ac103 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp +++ b/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp @@ -131,7 +131,7 @@ thread_trace_callback(uint32_t shader, void* buffer, uint64_t size, void* callba return HSA_STATUS_SUCCESS; } - callback_fn(shader, 0, "", cpu_data.data(), size, tool_userdata); + callback_fn(shader, cpu_data.data(), size, tool_userdata); return HSA_STATUS_SUCCESS; } @@ -140,15 +140,14 @@ post_kernel_call(ThreadTracer& tracer, inst_pkt_t& aql) { std::vector cpu_data{}; auto pair = cbdata_t{tracer.params->callback_userdata, tracer.params->shader_cb_fn, &cpu_data}; - (void) pair; for(auto& aql_pkt : aql) { auto* pkt = dynamic_cast(aql_pkt.first.get()); if(!pkt) continue; - // auto status = aqlprofile_att_iterate_data(pkt->GetHandle(), thread_trace_callback, - // &pair); CHECK_HSA(status, "Failed to iterate ATT data"); + auto status = aqlprofile_att_iterate_data(pkt->GetHandle(), thread_trace_callback, &pair); + CHECK_HSA(status, "Failed to iterate ATT data"); std::lock_guard lk(tracer.trace_resources_mut); if(tracer.agent_active_queues.find(pkt->GetAgent()) != tracer.agent_active_queues.end()) diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp b/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp index f2a54580ac..2cfca4abc3 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp +++ b/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp @@ -37,7 +37,7 @@ namespace rocprofiler { -struct thread_trace_parameters +struct thread_trace_parameter_pack { rocprofiler_context_id_t context_id; rocprofiler_att_dispatch_callback_t dispatch_cb_fn; @@ -45,18 +45,18 @@ struct thread_trace_parameters void* callback_userdata; // Parameters - rocprofiler_att_parameter_flag_t flags; - uint64_t buffer_size; - uint8_t target_cu; - uint8_t simd_select; - uint8_t reserved; - uint8_t vmid_mask; - uint16_t perfcounter_mask; - uint8_t perfcounter_ctrl; - uint64_t shader_engine_mask; + uint8_t target_cu = 1; + uint8_t simd_select = DEFAULT_SIMD; + uint8_t perfcounter_ctrl = 0; + uint64_t shader_engine_mask = DEFAULT_SE_MASK; + uint64_t buffer_size = DEFAULT_BUFFER_SIZE; // GFX9 Only std::vector perfcounters; + + static constexpr size_t DEFAULT_SIMD = 0x7; + static constexpr size_t DEFAULT_SE_MASK = 0x21; + static constexpr size_t DEFAULT_BUFFER_SIZE = 0x6000000; }; namespace hsa @@ -67,7 +67,7 @@ class AQLPacket; class ThreadTracer { public: - ThreadTracer(std::shared_ptr& _params) + ThreadTracer(std::shared_ptr& _params) : params(_params){}; virtual void start_context(); virtual void stop_context(); @@ -76,7 +76,7 @@ public: virtual ~ThreadTracer() = default; std::mutex trace_resources_mut; - std::shared_ptr params; + std::shared_ptr params; std::unordered_map> resources; std::unordered_map> agent_active_queues; }; // namespace thread_trace diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_parser.cpp b/source/lib/rocprofiler-sdk/thread_trace/att_parser.cpp new file mode 100644 index 0000000000..f5d2a70871 --- /dev/null +++ b/source/lib/rocprofiler-sdk/thread_trace/att_parser.cpp @@ -0,0 +1,212 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" + +#define AQLPROFILE_OCCUPANCY_RESOLUTION 8 + +namespace rocprofiler +{ +namespace att_parser +{ +hsa_status_t +forward_hsa_error(rocprofiler_status_t error_code) +{ + static thread_local std::unordered_map error_fwd = { + {ROCPROFILER_STATUS_SUCCESS, HSA_STATUS_SUCCESS}, + {ROCPROFILER_STATUS_ERROR, HSA_STATUS_ERROR}, + {ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT, HSA_STATUS_ERROR_INVALID_ARGUMENT}, + {ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES, HSA_STATUS_ERROR_OUT_OF_RESOURCES}, + }; + + try + { + return error_fwd.at(error_code); + } catch(std::exception& e) + {} + + return HSA_STATUS_ERROR; +} + +rocprofiler_status_t +forward_hsa_error(hsa_status_t error_code) +{ + static thread_local std::unordered_map error_fwd = { + {HSA_STATUS_SUCCESS, ROCPROFILER_STATUS_SUCCESS}, + {HSA_STATUS_ERROR, ROCPROFILER_STATUS_ERROR}, + {HSA_STATUS_ERROR_INVALID_ARGUMENT, ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT}, + {HSA_STATUS_ERROR_OUT_OF_RESOURCES, ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES}, + }; + + try + { + return error_fwd.at(error_code); + } catch(std::exception& e) + {} + + return ROCPROFILER_STATUS_ERROR; +} + +struct userdata_callback_table_t +{ + rocprofiler_att_parser_trace_callback_t trace; + rocprofiler_att_parser_isa_callback_t isa; + rocprofiler_att_parser_se_data_callback_t se_data; + void* user; + + std::vector kernel_id_map; +}; + +thread_local int TRACE_DATA_ID{-1}; +thread_local int KERNEL_ADDR_ID{-1}; +thread_local int OCCUPANCY_ID{-1}; + +void +iterate_trace_type(int id, const char* metadata, void*) +{ + if(std::string_view(metadata).find("occupancy") == 0) + OCCUPANCY_ID = id; + else if(std::string_view(metadata).find("kernel_ids_addr") == 0) + KERNEL_ADDR_ID = id; + else if(std::string_view(metadata).find("tracedata") == 0) + TRACE_DATA_ID = id; +} + +hsa_status_t +trace_callback(int trace_type_id, + int /* correlation_id */, + void* trace_events, + uint64_t trace_size, + void* userdata) +{ + assert(userdata); + auto& table = *reinterpret_cast(userdata); + + if(trace_type_id == KERNEL_ADDR_ID) + { + table.kernel_id_map.resize(trace_size); + const auto* events = reinterpret_cast(trace_events); + + for(size_t i = 0; i < trace_size; i++) + table.kernel_id_map.at(i) = events[i]; + } + else if(trace_type_id == OCCUPANCY_ID) + { + const auto* events = reinterpret_cast(trace_events); + for(size_t i = 0; i < trace_size; i++) + { + rocprofiler_att_data_type_occupancy_t occ{}; + occ.timestamp = events[i].time * AQLPROFILE_OCCUPANCY_RESOLUTION; + occ.enabled = events[i].enable; + try + { + pcinfo_t kernel_id_addr = table.kernel_id_map.at(events[i].kernel_id); + occ.marker_id = kernel_id_addr.marker_id; + occ.offset = kernel_id_addr.addr; + } catch(...) + {} // Not having a kernel_id_map entry is unexpected, but valid + table.trace(ROCPROFILER_ATT_PARSER_DATA_TYPE_OCCUPANCY, (void*) &occ, table.user); + } + } + else if(trace_type_id == TRACE_DATA_ID) + { + const auto* events = reinterpret_cast(trace_events); + for(size_t i = 0; i < trace_size; i++) + { + rocprofiler_att_data_type_isa_t isa{}; + isa.marker_id = events[i].pc.marker_id; + isa.offset = events[i].pc.addr; + isa.hitcount = events[i].hitcount; + isa.latency = events[i].latency; + table.trace(ROCPROFILER_ATT_PARSER_DATA_TYPE_ISA, (void*) &isa, table.user); + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +isa_callback(char* isa, + char* /* source_reference */, + uint64_t* memory_size, + uint64_t* isa_size, + uint64_t* source_size, + uint64_t marker, + uint64_t offset, + void* userdata) +{ + assert(userdata); + assert(source_size); + *source_size = 0; + const auto& table = *reinterpret_cast(userdata); + rocprofiler_status_t status = table.isa(isa, memory_size, isa_size, marker, offset, table.user); + + if(status != ROCPROFILER_STATUS_SUCCESS) + return rocprofiler::att_parser::forward_hsa_error(status); + return HSA_STATUS_SUCCESS; +} + +uint64_t +se_data_callback(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userdata) +{ + assert(userdata); + auto& table = *reinterpret_cast(userdata); + return table.se_data(seid, buffer, buffer_size, table.user); +} + +}; // namespace att_parser +}; // namespace rocprofiler + +extern "C" { +rocprofiler_status_t ROCPROFILER_API +rocprofiler_att_parse_data(rocprofiler_att_parser_se_data_callback_t user_se_data_callback, + rocprofiler_att_parser_trace_callback_t user_trace_callback, + rocprofiler_att_parser_isa_callback_t user_isa_callback, + void* userdata) +{ + static thread_local bool bInit = []() { + aqlprofile_att_parser_iterate_event_list(rocprofiler::att_parser::iterate_trace_type, + nullptr); + return true; + }(); + (void) bInit; + + rocprofiler::att_parser::userdata_callback_table_t table; + table.trace = user_trace_callback; + table.isa = user_isa_callback; + table.se_data = user_se_data_callback; + table.user = userdata; + + hsa_status_t status = aqlprofile_att_parse_data(rocprofiler::att_parser::se_data_callback, + rocprofiler::att_parser::trace_callback, + rocprofiler::att_parser::isa_callback, + (void*) &table); + + if(status != HSA_STATUS_SUCCESS) return rocprofiler::att_parser::forward_hsa_error(status); + return ROCPROFILER_STATUS_SUCCESS; +} +} diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp b/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp index eb5b209ab2..56c2043499 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp +++ b/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp @@ -27,21 +27,10 @@ #include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" extern "C" { -/** - * @brief Configure buffered dispatch profile Counting Service. - * Collects the counters in dispatch packets and stores them - * in buffer_id. The buffer may contain packets from more than - * one dispatch (denoted by correlation id). Will trigger the - * callback based on the parameters setup in buffer_id_t. - * - * @param [in] context_id context id - * @param [in] buffer_id id of the buffer to use for the counting service - * @param [in] profile profile config to use for dispatch - * @return ::rocprofiler_status_t - */ rocprofiler_status_t ROCPROFILER_API rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id, - rocprofiler_att_parameters_t parameters, + rocprofiler_att_parameter_t* parameters, + size_t num_parameters, rocprofiler_att_dispatch_callback_t dispatch_callback, rocprofiler_att_shader_data_callback_t shader_callback, void* callback_userdata) @@ -49,30 +38,43 @@ rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id); if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED; if(ctx->thread_trace) return ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED; - if(parameters.flags.raw != 0) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; - auto thread_tracer = std::make_shared(); + auto thread_tracer = std::make_shared(); thread_tracer->context_id = context_id; thread_tracer->dispatch_cb_fn = dispatch_callback; thread_tracer->shader_cb_fn = shader_callback; thread_tracer->callback_userdata = callback_userdata; - thread_tracer->flags = parameters.flags; - thread_tracer->buffer_size = parameters.buffer_size; - thread_tracer->target_cu = parameters.target_cu; - thread_tracer->simd_select = parameters.simd_select; - thread_tracer->vmid_mask = parameters.vmid_mask; + for(size_t p = 0; p < num_parameters; p++) + { + const rocprofiler_att_parameter_t& param = parameters[p]; + if(param.type > ROCPROFILER_ATT_PARAMETER_LAST) + return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; - thread_tracer->perfcounter_mask = parameters.perfcounter_mask; - thread_tracer->perfcounter_ctrl = parameters.perfcounter_ctrl; - - for(int i = 0; i < parameters.perfcounter_num; i++) - thread_tracer->perfcounters.emplace_back(parameters.perfcounter[i]); - - thread_tracer->shader_engine_mask = 0; - for(int i = 0; i < parameters.shader_num; i++) - thread_tracer->shader_engine_mask |= 1ul << parameters.shader_ids[i]; + switch(param.type) + { + case ROCPROFILER_ATT_PARAMETER_TARGET_CU: thread_tracer->target_cu = param.value; break; + case ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK: + thread_tracer->shader_engine_mask = param.value; + break; + case ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE: + thread_tracer->buffer_size = param.value; + break; + case ROCPROFILER_ATT_PARAMETER_SIMD_SELECT: + thread_tracer->simd_select = param.value; + break; + case ROCPROFILER_ATT_PARAMETER_OCCUPANCY_MODE_ENABLE: + return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + case ROCPROFILER_ATT_PARAMETER_PERFCOUNTERS_CTRL: + return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + case ROCPROFILER_ATT_PARAMETER_PERFCOUNTER: + return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + case ROCPROFILER_ATT_PARAMETER_LAST: return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; + } + // for(int i = 0; i < parameters.perfcounter_num; i++) + // thread_tracer->perfcounters.emplace_back(parameters.perfcounter[i]); + } ctx->thread_trace = std::make_shared(thread_tracer); diff --git a/source/lib/rocprofiler-sdk/thread_trace/tests/CMakeLists.txt b/source/lib/rocprofiler-sdk/thread_trace/tests/CMakeLists.txt new file mode 100644 index 0000000000..00bd67e403 --- /dev/null +++ b/source/lib/rocprofiler-sdk/thread_trace/tests/CMakeLists.txt @@ -0,0 +1,24 @@ +rocprofiler_deactivate_clang_tidy() + +include(GoogleTest) + +set(ROCPROFILER_THREAD_TRACE_TEST_SOURCES "att_packet_test.cpp") + +add_executable(thread-trace-packet-test) + +target_sources(thread-trace-packet-test PRIVATE ${ROCPROFILER_THREAD_TRACE_TEST_SOURCES}) + +target_link_libraries( + thread-trace-packet-test + PRIVATE rocprofiler::rocprofiler-static-library rocprofiler::rocprofiler-glog + rocprofiler::rocprofiler-hsa-runtime rocprofiler::rocprofiler-hip + rocprofiler::rocprofiler-common-library GTest::gtest GTest::gtest_main) + +gtest_add_tests( + TARGET thread-trace-packet-test + SOURCES ${ROCPROFILER_THREAD_TRACE_TEST_SOURCES} + TEST_LIST thread-trace-packet-test_TESTS + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +set_tests_properties(${thread-trace-packet-test_TESTS} PROPERTIES TIMEOUT 10 LABELS + "unittests") diff --git a/source/lib/rocprofiler-sdk/thread_trace/tests/att_packet_test.cpp b/source/lib/rocprofiler-sdk/thread_trace/tests/att_packet_test.cpp new file mode 100644 index 0000000000..a8d83a9bef --- /dev/null +++ b/source/lib/rocprofiler-sdk/thread_trace/tests/att_packet_test.cpp @@ -0,0 +1,154 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include + +#include +#include +#include +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" + +#include +#include +#include + +#include "lib/rocprofiler-sdk/agent.hpp" +#include "lib/rocprofiler-sdk/aql/helpers.hpp" +#include "lib/rocprofiler-sdk/aql/packet_construct.hpp" +#include "lib/rocprofiler-sdk/counters/metrics.hpp" +#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" +#include "lib/rocprofiler-sdk/hsa/queue.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" + +#define ROCPROFILER_CALL(ARG, MSG) \ + { \ + auto _status = (ARG); \ + EXPECT_EQ(_status, ROCPROFILER_STATUS_SUCCESS) << MSG << " :: " << #ARG; \ + } + +namespace rocprofiler +{ +AmdExtTable& +get_ext_table() +{ + static auto _v = []() { + auto val = AmdExtTable{}; + val.hsa_amd_memory_pool_get_info_fn = hsa_amd_memory_pool_get_info; + val.hsa_amd_agent_iterate_memory_pools_fn = hsa_amd_agent_iterate_memory_pools; + val.hsa_amd_memory_pool_allocate_fn = hsa_amd_memory_pool_allocate; + val.hsa_amd_memory_pool_free_fn = hsa_amd_memory_pool_free; + val.hsa_amd_agent_memory_pool_get_info_fn = hsa_amd_agent_memory_pool_get_info; + val.hsa_amd_agents_allow_access_fn = hsa_amd_agents_allow_access; + return val; + }(); + return _v; +} + +CoreApiTable& +get_api_table() +{ + static auto _v = []() { + auto val = CoreApiTable{}; + val.hsa_iterate_agents_fn = hsa_iterate_agents; + val.hsa_agent_get_info_fn = hsa_agent_get_info; + val.hsa_queue_create_fn = hsa_queue_create; + val.hsa_queue_destroy_fn = hsa_queue_destroy; + val.hsa_signal_wait_relaxed_fn = hsa_signal_wait_relaxed; + return val; + }(); + return _v; +} + +void +test_init() +{ + HsaApiTable table; + table.amd_ext_ = &get_ext_table(); + table.core_ = &get_api_table(); + agent::construct_agent_cache(&table); + ASSERT_TRUE(hsa::get_queue_controller() != nullptr); + hsa::get_queue_controller()->init(get_api_table(), get_ext_table()); +} + +} // namespace rocprofiler + +using namespace rocprofiler::aql; + +TEST(thread_trace, construct_default_packets) +{ + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + rocprofiler::test_init(); + auto agents = rocprofiler::hsa::get_queue_controller()->get_supported_agents(); + ASSERT_GT(agents.size(), 0); + for(const auto& [_, agent] : agents) + { + auto params = std::make_shared(); + + ThreadTraceAQLPacketFactory factory( + agent, params, rocprofiler::get_api_table(), rocprofiler::get_ext_table()); + + auto packet = factory.construct_packet(); + + size_t vendor_packet = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + ASSERT_TRUE(packet->start.header == vendor_packet); + ASSERT_TRUE(packet->stop.header == vendor_packet); + ASSERT_TRUE(packet->before_krn_pkt.size() > 0); + ASSERT_TRUE(packet->after_krn_pkt.size() > 0); + } + hsa_shut_down(); +} + +TEST(thread_trace, configure_test) +{ + rocprofiler::test_init(); + + rocprofiler::registration::init_logging(); + rocprofiler::registration::set_init_status(-1); + rocprofiler::context::push_client(1); + rocprofiler_context_id_t ctx; + ROCPROFILER_CALL(rocprofiler_create_context(&ctx), "context creation failed"); + + std::vector params; + params.push_back({ROCPROFILER_ATT_PARAMETER_TARGET_CU, 1}); + params.push_back({ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, 0xF}); + params.push_back({ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, 0x1000000}); + params.push_back({ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, 0xF}); + + rocprofiler_configure_thread_trace_service( + ctx, + params.data(), + params.size(), + [](rocprofiler_queue_id_t, + const rocprofiler_agent_t*, + rocprofiler_correlation_id_t, + const hsa_kernel_dispatch_packet_t*, + uint64_t, + void*) { return ROCPROFILER_ATT_CONTROL_NONE; }, + [](int64_t, void*, size_t, void*) {}, + nullptr); + + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + ROCPROFILER_CALL(rocprofiler_start_context(ctx), "context start failed"); + ROCPROFILER_CALL(rocprofiler_stop_context(ctx), "context stop failed"); +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ad658a6552..128ebdb4f6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -57,6 +57,7 @@ add_subdirectory(async-copy-tracing) add_subdirectory(scratch-memory-tracing) add_subdirectory(c-tool) add_subdirectory(page-migration) +add_subdirectory(thread-trace) # rocprofv3 validation tests add_subdirectory(rocprofv3) diff --git a/tests/thread-trace/CMakeLists.txt b/tests/thread-trace/CMakeLists.txt new file mode 100644 index 0000000000..e5018f332e --- /dev/null +++ b/tests/thread-trace/CMakeLists.txt @@ -0,0 +1,67 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project( + rocprofiler-tests-thread-trace + LANGUAGES CXX HIP + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) +find_package(amd_comgr REQUIRED) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +add_library(thread-trace-api-test-lib SHARED) +target_sources(thread-trace-api-test-lib PRIVATE verify_data.cpp) +target_link_libraries( + thread-trace-api-test-lib PRIVATE rocprofiler::rocprofiler rocprofiler-sdk-codeobj + amd_comgr dw) + +set_source_files_properties(kernel_run.cpp PROPERTIES COMPILE_FLAGS "-g -O2") +set_source_files_properties(kernel_run.cpp PROPERTIES LANGUAGE HIP) + +add_executable(thread-trace-api-test-binary) +target_sources(thread-trace-api-test-binary PRIVATE kernel_run.cpp) + +if(ROCPROFILER_MEMCHECK_PRELOAD_ENV) + set(PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") +else() + set(PRELOAD_ENV "LD_PRELOAD=$") +endif() + +target_link_libraries(thread-trace-api-test-binary PRIVATE thread-trace-api-test-lib) + +add_test(NAME thread-trace-api-tests COMMAND $) + +set_tests_properties( + thread-trace-api-tests + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${PRELOAD_ENV}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/tests/thread-trace/kernel_run.cpp b/tests/thread-trace/kernel_run.cpp new file mode 100644 index 0000000000..c174db8483 --- /dev/null +++ b/tests/thread-trace/kernel_run.cpp @@ -0,0 +1,93 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +/** + * @file samples/code_object_isa_decode/client.cpp + * + * @brief Example rocprofiler client (tool) + */ + +#include +#include +#include +#include "hip/hip_runtime.h" + +// Three waves per SIMD on MI300 +#define DATA_SIZE (304 * 64 * 4 * 3) +#define HIP_API_CALL(CALL) assert((CALL) == hipSuccess) + +template +__global__ void +branching_kernel(T* __restrict__ a, + const float* __restrict__ b, + const float* __restrict__ c, + int size) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if(blockIdx.x % 2 == 0) + a[index] = b[index] + c[index]; + else + a[index] = b[index] * c[index] - 2.0f; +} + +class hipMemory +{ +public: + hipMemory(size_t size) + { + HIP_API_CALL(hipMalloc(&ptr, size * sizeof(float))); + HIP_API_CALL(hipMemset(ptr, 0, size * sizeof(float))); + } + ~hipMemory() + { + if(ptr) HIP_API_CALL(hipFree(ptr)); + } + float* ptr = nullptr; +}; + +int +main(int argc, char** argv) +{ + hipMemory src1(DATA_SIZE); + hipMemory src2(DATA_SIZE); + hipMemory dst(DATA_SIZE); + + hipLaunchKernelGGL(branching_kernel, + dim3(DATA_SIZE / 64), + dim3(64), + 0, + 0, + dst.ptr, + src1.ptr, + src2.ptr, + DATA_SIZE); + + HIP_API_CALL(hipGetLastError()); + HIP_API_CALL(hipDeviceSynchronize()); + + return 0; +} \ No newline at end of file diff --git a/tests/thread-trace/verify_data.cpp b/tests/thread-trace/verify_data.cpp new file mode 100644 index 0000000000..d148504406 --- /dev/null +++ b/tests/thread-trace/verify_data.cpp @@ -0,0 +1,414 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +/** + * @file samples/code_object_isa_decode/client.cpp + * + * @brief Example rocprofiler client (tool) + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define WAVE_RATIO_TOLERANCE 0.05 + +#define ROCPROFILER_CALL(result, msg) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ + } \ + } + +#define C_API_BEGIN \ + try \ + { +#define C_API_END \ + } \ + catch(std::exception & e) \ + { \ + std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \ + } \ + catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; } + +namespace thread_trace_test_client +{ +using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using Instruction = rocprofiler::codeobj::disassembly::Instruction; +using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; + +std::mutex isa_map_mut; +rocprofiler_client_id_t* client_id = nullptr; + +struct isa_map_elem_t +{ + std::atomic hitcount{0}; + std::atomic latency{0}; + std::shared_ptr code_line{nullptr}; +}; + +struct pcinfo_t +{ + uint64_t marker_id; + uint64_t addr; +}; + +bool +operator==(const pcinfo_t& a, const pcinfo_t& b) +{ + return a.addr == b.addr && a.marker_id == b.marker_id; +}; + +bool +operator<(const pcinfo_t& a, const pcinfo_t& b) +{ + if(a.marker_id == b.marker_id) return a.addr < b.addr; + return a.marker_id < b.marker_id; +}; + +struct ToolData +{ + std::unordered_map kernel_object_to_kernel_name = {}; + CodeobjAddressTranslate codeobjTranslate; + std::map> isa_map; + std::atomic waves_started = 0; + std::atomic waves_ended = 0; +}; + +struct trace_data_t +{ + int64_t id; + uint8_t* data; + uint64_t size; + ToolData* tool; +}; + +void +tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /* user_data */, + void* callback_data) +{ + C_API_BEGIN + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; + if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return; + + assert(callback_data && "Shader callback passed null!"); + ToolData& tool = *reinterpret_cast(callback_data); + + if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + std::unique_lock lg(isa_map_mut); + auto* data = static_cast(record.payload); + tool.kernel_object_to_kernel_name.emplace(data->kernel_object, data->kernel_name); + } + + if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return; + + auto* data = static_cast(record.payload); + if(!data || !data->uri) return; + + std::unique_lock lg(isa_map_mut); + + if(std::string_view(data->uri).find("file:///") == 0) + { + tool.codeobjTranslate.addDecoder(data->uri, 0, data->load_delta, data->load_size); + } + else + { + tool.codeobjTranslate.addDecoder(reinterpret_cast(data->memory_base), + data->memory_size, + data->code_object_id, + data->load_delta, + data->load_size); + } + + C_API_END +} + +rocprofiler_att_control_flags_t +dispatch_callback(rocprofiler_queue_id_t /* queue_id */, + const rocprofiler_agent_t* /* agent */, + rocprofiler_correlation_id_t /* correlation_id */, + const hsa_kernel_dispatch_packet_t* dispatch_packet, + uint64_t /* kernel_id */, + void* userdata) +{ + C_API_BEGIN + assert(userdata && "Dispatch callback passed null!"); + ToolData& tool = *reinterpret_cast(userdata); + + static std::atomic call_id{0}; + static std::string_view desired_func_name = "branching_kernel"; + + try + { + auto& kernel_name = tool.kernel_object_to_kernel_name.at(dispatch_packet->kernel_object); + if(kernel_name.find(desired_func_name) == std::string::npos) + return ROCPROFILER_ATT_CONTROL_NONE; + + if(call_id.fetch_add(1) == 0) return ROCPROFILER_ATT_CONTROL_START_AND_STOP; + } catch(...) + { + std::cerr << "Could not find kernel object: " << dispatch_packet->kernel_object + << std::endl; + } + + C_API_END + return ROCPROFILER_ATT_CONTROL_NONE; +} + +void +get_trace_data(rocprofiler_att_parser_data_type_t type, void* att_data, void* userdata) +{ + C_API_BEGIN + assert(userdata && "ISA callback passed null!"); + trace_data_t& trace_data = *reinterpret_cast(userdata); + assert(trace_data.tool && "ISA callback passed null!"); + ToolData& tool = *reinterpret_cast(trace_data.tool); + + if(type == ROCPROFILER_ATT_PARSER_DATA_TYPE_OCCUPANCY) + { + const auto& ev = reinterpret_cast(att_data); + if(ev->enabled) + tool.waves_started.fetch_add(1); + else + tool.waves_ended.fetch_add(1); + } + + if(type != ROCPROFILER_ATT_PARSER_DATA_TYPE_ISA) return; + + std::unique_lock lk(isa_map_mut); + auto& event = *reinterpret_cast(att_data); + + pcinfo_t pc{event.marker_id, event.offset}; + auto it = tool.isa_map.find(pc); + if(it == tool.isa_map.end()) + { + auto ptr = std::make_unique(); + try + { + ptr->code_line = tool.codeobjTranslate.get(pc.marker_id, pc.addr); + } catch(...) + { + return; + } + it = tool.isa_map.emplace(pc, std::move(ptr)).first; + } + + it->second->hitcount.fetch_add(event.hitcount, std::memory_order_relaxed); + it->second->latency.fetch_add(event.latency, std::memory_order_relaxed); + C_API_END +} + +uint64_t +copy_trace_data(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userdata) +{ + trace_data_t& data = *reinterpret_cast(userdata); + *seid = data.id; + *buffer_size = data.size; + *buffer = data.data; + data.size = 0; + return *buffer_size; +} + +rocprofiler_status_t +isa_callback(char* isa_instruction, + uint64_t* isa_memory_size, + uint64_t* isa_size, + uint64_t marker_id, + uint64_t offset, + void* userdata) +{ + C_API_BEGIN + assert(userdata && "ISA callback passed null!"); + trace_data_t& trace_data = *reinterpret_cast(userdata); + assert(trace_data.tool && "ISA callback passed null!"); + ToolData& tool = *reinterpret_cast(trace_data.tool); + + std::shared_ptr instruction; + + { + std::unique_lock unique_lock(isa_map_mut); + instruction = tool.codeobjTranslate.get(marker_id, offset); + } + + if(!instruction.get()) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; + + { + size_t tmp_isa_size = *isa_size; + *isa_size = instruction->inst.size(); + + if(*isa_size > tmp_isa_size) return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; + } + + memcpy(isa_instruction, instruction->inst.data(), *isa_size); + *isa_memory_size = instruction->size; + + auto ptr = std::make_unique(); + ptr->code_line = std::move(instruction); + tool.isa_map.emplace(pcinfo_t{marker_id, offset}, std::move(ptr)); + C_API_END + return ROCPROFILER_STATUS_SUCCESS; +} + +void +shader_data_callback(int64_t se_id, void* se_data, size_t data_size, void* userdata) +{ + C_API_BEGIN + assert(userdata && "Shader callback passed null!"); + ToolData& tool = *reinterpret_cast(userdata); + + trace_data_t data{.id = se_id, .data = (uint8_t*) se_data, .size = data_size, .tool = &tool}; + auto status = rocprofiler_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data); + if(status != ROCPROFILER_STATUS_SUCCESS) + std::cerr << "shader_data_callback failed with status " << status << std::endl; + C_API_END +} + +int +tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +{ + static rocprofiler_context_id_t client_ctx = {}; + (void) fini_func; + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + tool_codeobj_tracing_callback, + tool_data), + "code object tracing service configure"); + + ROCPROFILER_CALL( + rocprofiler_configure_thread_trace_service( + client_ctx, nullptr, 0, dispatch_callback, shader_data_callback, tool_data), + "thread trace service configure"); + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), + "context validity check"); + if(valid_ctx == 0) + { + // notify rocprofiler that initialization failed + // and all the contexts, buffers, etc. created + // should be ignored + return -1; + } + + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); + + // no errors + return 0; +} + +void +tool_fini(void* tool_data) +{ + assert(tool_data && "tool_fini callback passed null!"); + ToolData& tool = *reinterpret_cast(tool_data); + + std::unique_lock isa_lk(isa_map_mut); + + // Find largest instruction + size_t max_inst_size = 0; + for(auto& [addr, lines] : tool.isa_map) + if(lines.get()) max_inst_size = std::max(max_inst_size, lines->code_line->inst.size()); + + assert(max_inst_size > 0); + + size_t total_hit = 0; + size_t total_cycles = 0; + + for(auto& [addr, line] : tool.isa_map) + { + total_hit += line->hitcount.load(std::memory_order_relaxed); + total_cycles += line->latency.load(std::memory_order_relaxed); + } + + assert(total_cycles > 0); + assert(total_hit > 0); + + double wave_started = (double) tool.waves_started.load(); + double wave_event_ratio = wave_started / (wave_started + (double) tool.waves_ended.load()); + assert(wave_event_ratio > 0.5 - WAVE_RATIO_TOLERANCE); + assert(wave_event_ratio < 0.5 + WAVE_RATIO_TOLERANCE); +} + +} // namespace thread_trace_test_client + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t /* version */, + const char* /* runtime_version */, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate if main tool + if(priority > 0) return nullptr; + + // set the client name + id->name = "Adv_Thread_Trace_Sample"; + + // store client info + thread_trace_test_client::client_id = id; + + auto* data = new thread_trace_test_client::ToolData{}; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &thread_trace_test_client::tool_init, + &thread_trace_test_client::tool_fini, + reinterpret_cast(data)}; + + // return pointer to configure data + return &cfg; +}