diff --git a/samples/advanced_thread_trace/client.cpp b/samples/advanced_thread_trace/client.cpp index 871d8f5495..de3ed947f6 100644 --- a/samples/advanced_thread_trace/client.cpp +++ b/samples/advanced_thread_trace/client.cpp @@ -26,7 +26,7 @@ #endif /** - * @file samples/code_object_isa_decode/client.cpp + * @file samples/advanced_thread_trace/client.cpp * * @brief Example rocprofiler client (tool) */ @@ -140,9 +140,9 @@ struct ToolData std::mutex output_mut; CodeobjAddressTranslate codeobjTranslate; std::map> isa_map; - std::unordered_map kernels_in_codeobj = {}; - std::unordered_map kernel_object_to_kernel_name = {}; - int num_waves = 0; + std::unordered_map kernels_in_codeobj = {}; + std::unordered_map kernel_id_to_kernel_name = {}; + int num_waves = 0; std::ostream& output() { @@ -205,7 +205,7 @@ tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, { std::unique_lock lg(tool.isa_map_mut); auto* data = static_cast(record.payload); - tool.kernel_object_to_kernel_name.emplace(data->kernel_object, data->kernel_name); + tool.kernel_id_to_kernel_name.emplace(data->kernel_id, data->kernel_name); } if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return; @@ -242,9 +242,8 @@ rocprofiler_att_control_flags_t dispatch_callback(rocprofiler_queue_id_t /* queue_id */, const rocprofiler_agent_t* /* agent */, rocprofiler_correlation_id_t /* correlation_id */, - const hsa_kernel_dispatch_packet_t* dispatch_packet, - uint64_t /* kernel_id */, - void* userdata) + rocprofiler_kernel_id_t kernel_id, + void* userdata) { C_API_BEGIN assert(userdata && "Dispatch callback passed null!"); @@ -252,22 +251,20 @@ dispatch_callback(rocprofiler_queue_id_t /* queue_id */, std::shared_lock lg(tool.isa_map_mut); - constexpr int desired_call_iteration = 1; static std::atomic call_id{0}; - static std::string_view desired_func_name = "transposeLdsSwapInplace"; + static std::string_view desired_func_name = "transposeLds"; try { - auto& kernel_name = tool.kernel_object_to_kernel_name.at(dispatch_packet->kernel_object); + auto& kernel_name = tool.kernel_id_to_kernel_name.at(kernel_id); if(kernel_name.find(desired_func_name) == std::string::npos) return ROCPROFILER_ATT_CONTROL_NONE; - if(call_id.fetch_add(1) == desired_call_iteration) - return ROCPROFILER_ATT_CONTROL_START_AND_STOP; + int id = call_id.fetch_add(1); + if(id == 1) return ROCPROFILER_ATT_CONTROL_START_AND_STOP; } catch(...) { - std::cerr << "Could not find kernel object: " << dispatch_packet->kernel_object - << std::endl; + std::cerr << "Could not find kernel id: " << kernel_id << std::endl; } C_API_END @@ -512,7 +509,7 @@ tool_fini(void* tool_data) << scalar_latency / float(scalar_exec) << " cycles.\n" << "Vector memory ops occupied: " << vmc_fraction << "% of cycles.\n" << "Scalar and LDS memory ops occupied: " << lgk_fraction << "% of cycles.\n" - << std::endl; + << "Num waves created: " << (tool.num_waves / 2) << std::endl; } } // namespace client diff --git a/samples/code_object_isa_decode/client.cpp b/samples/code_object_isa_decode/client.cpp index 43caaa97fd..663f9eee3d 100644 --- a/samples/code_object_isa_decode/client.cpp +++ b/samples/code_object_isa_decode/client.cpp @@ -130,7 +130,7 @@ tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, return; } - auto symbolmap = codeobjTranslate.getSymbolMap(data->code_object_id); + auto symbolmap = codeobjTranslate.getSymbolMap(); for(auto& [vaddr, symbol] : symbolmap) registered_kernels.insert({symbol.name, {vaddr, vaddr + symbol.mem_size}}); } @@ -160,14 +160,17 @@ tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, size_t vaddr = begin_end.first; while(vaddr < begin_end.second) { - auto inst = codeobjTranslate.get(vaddr); - std::string_view source = inst->comment; - if(source.rfind('/') < source.size()) source = source.substr(source.rfind('/')); - if(vaddr < begin_end.first + 64) output_stream() << '\t' << inst->inst << '\n'; + auto inst = codeobjTranslate.get(vaddr); + if(inst->comment.size()) + { + std::string_view source = inst->comment; + if(source.rfind('/') < source.size()) source = source.substr(source.rfind('/')); + if(vaddr < begin_end.first + 64) output_stream() << '\t' << inst->inst << '\n'; - if(source.rfind(':') < source.size()) source = source.substr(0, source.rfind(':')); + if(source.rfind(':') < source.size()) source = source.substr(0, source.rfind(':')); - references.insert(std::string(source)); + references.insert(std::string(source)); + } if(inst->inst.find("v_") == 0) num_vector++; else if(inst->inst.find("s_waitcnt") == 0) @@ -227,10 +230,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) } void -tool_fini(void* tool_data) -{ - (void) tool_data; -} +tool_fini(void* /* tool_data */) +{} } // namespace diff --git a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/code_printing.hpp b/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/code_printing.hpp index c0859d5c68..0ed533894f 100644 --- a/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/code_printing.hpp +++ b/source/include/rocprofiler-sdk/amd_detail/rocprofiler-sdk-codeobj/code_printing.hpp @@ -309,7 +309,8 @@ private: class CodeobjMap { public: - CodeobjMap() = default; + CodeobjMap() = default; + virtual ~CodeobjMap() = default; virtual void addDecoder(const char* filepath, codeobj_marker_id_t id, @@ -357,7 +358,8 @@ class CodeobjAddressTranslate : public CodeobjMap using Super = CodeobjMap; public: - CodeobjAddressTranslate() = default; + CodeobjAddressTranslate() = default; + ~CodeobjAddressTranslate() override = default; virtual void addDecoder(const char* filepath, codeobj_marker_id_t id, @@ -409,39 +411,31 @@ public: return nullptr; } - void getSymbolMap(const std::shared_ptr& dec, - std::unordered_map& symbols) const + std::map getSymbolMap() const { - try + std::map symbols; + + for(auto& [_, dec] : decoders) { auto& smap = dec->getSymbolMap(); for(auto& [vaddr, sym] : smap) symbols[vaddr + dec->load_addr] = sym; - } catch(std::exception& e) + } + + return symbols; + } + + std::map getSymbolMap(codeobj_marker_id_t id) const + { + if(decoders.find(id) == decoders.end()) return {}; + + try { - return; - }; - } - - std::unordered_map getSymbolMap() const - { - std::unordered_map symbols; - - for(auto& [_, dec] : decoders) - this->getSymbolMap(dec, symbols); - - return symbols; - } - - std::unordered_map getSymbolMap(codeobj_marker_id_t id) const - { - std::unordered_map symbols; - - auto it = decoders.find(id); - if(it == decoders.end()) return symbols; - - this->getSymbolMap(it->second, symbols); - return symbols; + return decoders.at(id)->getSymbolMap(); + } catch(...) + { + return {}; + } } private: diff --git a/source/include/rocprofiler-sdk/amd_detail/thread_trace.h b/source/include/rocprofiler-sdk/amd_detail/thread_trace.h index c961b828dc..5e769bb092 100644 --- a/source/include/rocprofiler-sdk/amd_detail/thread_trace.h +++ b/source/include/rocprofiler-sdk/amd_detail/thread_trace.h @@ -38,13 +38,11 @@ ROCPROFILER_EXTERN_C_INIT typedef enum { - ROCPROFILER_ATT_PARAMETER_TARGET_CU = 0, - ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, - ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, - ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, - ROCPROFILER_ATT_PARAMETER_PERFCOUNTERS_CTRL, - ROCPROFILER_ATT_PARAMETER_PERFCOUNTER, - ROCPROFILER_ATT_PARAMETER_OCCUPANCY_MODE_ENABLE, + ROCPROFILER_ATT_PARAMETER_TARGET_CU = 0, ///< Select the Target CU or WGP + ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, ///< Bitmask of shader engines. + ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, ///< Size of combined GPU buffer for ATT + ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, ///< Bitmask (GFX9) or ID (Navi) of SIMDs + ROCPROFILER_ATT_PARAMETER_CODE_OBJECT_TRACE_ENABLE, ///< Enables Codeobj Markers IDs into ATT ROCPROFILER_ATT_PARAMETER_LAST } rocprofiler_att_parameter_type_t; @@ -62,19 +60,37 @@ typedef enum ROCPROFILER_ATT_CONTROL_START_AND_STOP = 3 } rocprofiler_att_control_flags_t; +/** + * @brief Callback to be triggered every kernel dispatch, indicating to start and/or stop ATT + */ typedef rocprofiler_att_control_flags_t (*rocprofiler_att_dispatch_callback_t)( - rocprofiler_queue_id_t queue_id, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t correlation_id, - const hsa_kernel_dispatch_packet_t* dispatch_packet, - uint64_t kernel_id, - void* userdata); + rocprofiler_queue_id_t queue_id, + const rocprofiler_agent_t* agent, + rocprofiler_correlation_id_t correlation_id, + rocprofiler_kernel_id_t kernel_id, + void* userdata); +/** + * @brief Callback to be triggered every time some ATT data is generated by the device + * @param [in] shader_engine_id ID of shader engine, as enabled by SE_MASK + * @param [in] data Pointer to the buffer containing the ATT data + * @param [in] data_size Number of bytes in "data" + * @param [in] userdata Passed back to user + */ typedef void (*rocprofiler_att_shader_data_callback_t)(int64_t shader_engine_id, void* data, size_t data_size, void* userdata); +/** + * @brief Enables the advanced thread trace service. + * @param [in] context_id context_id. + * @param [in] parameters List of ATT-specific parameters. + * @param [in] num_parameters Number of parameters. Zero is allowed. + * @param [in] dispatch_callback Control fn which decides when ATT starts/stop collecting. + * @param [in] shader_callback Callback fn where the collected data will be sent to. + * @param [in] callback_userdata Passed back to user. + */ rocprofiler_status_t rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id, rocprofiler_att_parameter_t* parameters, diff --git a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h index 2d6a6da1e6..6de750beb2 100644 --- a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h +++ b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h @@ -634,36 +634,32 @@ aqlprofile_att_parse_data(aqlprofile_att_se_data_callback_t se_data_callback, aqlprofile_att_isa_callback_t isa_callback, void* userdata); -/** - * @brief Contains flags for how code objects are interpreted - */ -typedef union +typedef struct { - struct - { - uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload - uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started? - uint32_t legacy_id : 30; // Legacy code object ID, if it fits in 30 bits. - }; - uint32_t raw; -} aqlprofile_att_header_marker_t; + uint64_t id; + uint64_t addr; + uint64_t size; + hsa_agent_t agent; + uint32_t isUnload : 1; + uint32_t fromStart : 1; +} aqlprofile_att_codeobj_data_t; /** * @brief Creates an AQL packet for marking code objects - * @param[out] packets Returned packet - * @param[in] handle The handle created from aqlprofile_att_create_packets() - * @param[in] header Header containing code object information created from profiler - * @param[in] id To be passed back to isa_string_callback in marker_id - * @param[in] addr Code object loaded address. - * @param[in] size Code object loaded size. + * @param[out] packet Returned packet + * @param[out] handle The handle created from aqlprofile_att_create_packets() + * @param[in] data Code object information + * @param[in] alloc_cb Callback to return both CPU and GPU accessible memory on demand + * @param[in] dealloc_cb Callback to free data allocated by alloc_cb() + * @param[in] userdata Userdata to be passed back to memory callbacks */ hsa_status_t -aqlprofile_att_codeobj_load_marker(hsa_ext_amd_aql_pm4_packet_t* packets, - aqlprofile_handle_t handle, - aqlprofile_att_header_marker_t header, - uint64_t id, - uint64_t addr, - uint64_t size); +aqlprofile_att_codeobj_marker(hsa_ext_amd_aql_pm4_packet_t* packet, + aqlprofile_handle_t* handle, + aqlprofile_att_codeobj_data_t data, + aqlprofile_memory_alloc_callback_t alloc_cb, + aqlprofile_memory_dealloc_callback_t dealloc_cb, + void* userdata); #ifdef __cplusplus } diff --git a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp index 1ee3db54c6..47e53af44f 100644 --- a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp +++ b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp @@ -198,65 +198,54 @@ CounterPacketConstruct::construct_packet(const AmdExtTable& ext) return pkt_ptr; } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wnarrowing" - -ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory( - const hsa::AgentCache& agent, - std::shared_ptr& params, - const CoreApiTable& coreapi, - const AmdExtTable& ext) +ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent, + const thread_trace_parameter_pack& params, + const CoreApiTable& coreapi, + const AmdExtTable& ext) { - this->tracepool = std::make_shared(); - this->tracepool->allocate_fn = ext.hsa_amd_memory_pool_allocate_fn; - this->tracepool->allow_access_fn = ext.hsa_amd_agents_allow_access_fn; - this->tracepool->free_fn = ext.hsa_amd_memory_pool_free_fn; - this->tracepool->api_copy_fn = coreapi.hsa_memory_copy_fn; - this->tracepool->gpu_agent = agent.get_hsa_agent(); - this->tracepool->cpu_pool_ = agent.cpu_pool(); - this->tracepool->gpu_pool_ = agent.gpu_pool(); + this->tracepool = hsa::TraceMemoryPool{}; + this->tracepool.allocate_fn = ext.hsa_amd_memory_pool_allocate_fn; + this->tracepool.allow_access_fn = ext.hsa_amd_agents_allow_access_fn; + this->tracepool.free_fn = ext.hsa_amd_memory_pool_free_fn; + this->tracepool.api_copy_fn = coreapi.hsa_memory_copy_fn; + this->tracepool.gpu_agent = agent.get_hsa_agent(); + this->tracepool.cpu_pool_ = agent.cpu_pool(); + this->tracepool.gpu_pool_ = agent.gpu_pool(); - this->aql_params.clear(); - auto& p = this->aql_params; - p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, params->target_cu}); - p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, params->shader_engine_mask}); - p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, params->simd_select}); - p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, params->buffer_size}); + uint32_t cu = static_cast(params.target_cu); + uint32_t shader_engine_mask = static_cast(params.shader_engine_mask); + uint32_t simd = static_cast(params.simd_select); + uint32_t buffer_size = static_cast(params.buffer_size); - this->profile = aqlprofile_att_profile_t{agent.get_hsa_agent(), p.data(), p.size()}; + aql_params.clear(); + aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, cu}); + aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, shader_engine_mask}); + aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, simd}); + aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, buffer_size}); } -#pragma GCC diagnostic pop - -std::unique_ptr +std::unique_ptr ThreadTraceAQLPacketFactory::construct_packet() { - auto packet = std::make_unique(this->tracepool); - hsa_status_t _status = aqlprofile_att_create_packets(&packet->handle, - &packet->packets, - this->profile, - &hsa::TraceAQLPacket::Alloc, - &hsa::TraceAQLPacket::Free, - &hsa::TraceAQLPacket::Copy, - packet.get()); - CHECK_HSA(_status, "failed to create ATT packet"); - - packet->before_krn_pkt.clear(); - packet->after_krn_pkt.clear(); - packet->packets.start_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - packet->packets.stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - packet->packets.start_packet.completion_signal = hsa_signal_t{.handle = 0}; - packet->packets.stop_packet.completion_signal = hsa_signal_t{.handle = 0}; - - packet->empty = false; - packet->start = packet->packets.start_packet; - packet->stop = packet->packets.stop_packet; - packet->before_krn_pkt.push_back(packet->start); - packet->after_krn_pkt.push_back(packet->stop); - + uint32_t num_params = static_cast(aql_params.size()); + auto profile = aqlprofile_att_profile_t{tracepool.gpu_agent, aql_params.data(), num_params}; + auto packet = std::make_unique(this->tracepool, profile); + packet->clear(); return packet; } +std::unique_ptr +ThreadTraceAQLPacketFactory::construct_load_marker_packet(uint64_t id, uint64_t addr, uint64_t size) +{ + return std::make_unique(tracepool, id, addr, size, false, false); +} + +std::unique_ptr +ThreadTraceAQLPacketFactory::construct_unload_marker_packet(uint64_t id) +{ + return std::make_unique(tracepool, id, 0, 0, false, true); +} + std::vector CounterPacketConstruct::get_all_events() const { diff --git a/source/lib/rocprofiler-sdk/aql/packet_construct.hpp b/source/lib/rocprofiler-sdk/aql/packet_construct.hpp index 6f46c13bc3..058e0bb6da 100644 --- a/source/lib/rocprofiler-sdk/aql/packet_construct.hpp +++ b/source/lib/rocprofiler-sdk/aql/packet_construct.hpp @@ -87,16 +87,19 @@ protected: class ThreadTraceAQLPacketFactory { public: - ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent, - std::shared_ptr& params, - const CoreApiTable& coreapi, - const AmdExtTable& ext); - std::unique_ptr construct_packet(); + ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent, + const thread_trace_parameter_pack& params, + const CoreApiTable& coreapi, + const AmdExtTable& ext); + std::unique_ptr construct_packet(); + std::unique_ptr construct_load_marker_packet(uint64_t id, + uint64_t addr, + uint64_t size); + std::unique_ptr construct_unload_marker_packet(uint64_t id); private: - std::shared_ptr tracepool; + hsa::TraceMemoryPool tracepool; std::vector aql_params; - aqlprofile_att_profile_t profile; }; } // namespace aql diff --git a/source/lib/rocprofiler-sdk/context/context.hpp b/source/lib/rocprofiler-sdk/context/context.hpp index e23a4ba616..a18749b985 100644 --- a/source/lib/rocprofiler-sdk/context/context.hpp +++ b/source/lib/rocprofiler-sdk/context/context.hpp @@ -137,8 +137,9 @@ struct context // Only one of counter collection/agent counter collection can exists in the ctx. std::unique_ptr counter_collection = {}; std::unique_ptr agent_counter_collection = {}; - std::shared_ptr thread_trace = {}; std::unique_ptr pc_sampler = {}; + // TODO: Make a unique pointer instead + std::shared_ptr thread_trace = {}; }; // set the client index needs to be called before allocate_context() diff --git a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp index ec076b4011..dea20a56a2 100644 --- a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp +++ b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp @@ -23,11 +23,12 @@ #include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" #include #include +#include "lib/common/logging.hpp" #define CHECK_HSA(fn, message) \ if((fn) != HSA_STATUS_SUCCESS) \ { \ - std::cerr << __FILE__ << ':' << __LINE__ << ' ' << message; \ + ROCP_ERROR << message; \ exit(1); \ } @@ -64,47 +65,96 @@ CounterAQLPacket::~CounterAQLPacket() } } -TraceAQLPacket::~TraceAQLPacket() { aqlprofile_att_delete_packets(this->handle); } - hsa_status_t -TraceAQLPacket::Alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t flags, void* data) +BaseTTAQLPacket::Alloc(void** ptr, size_t size, desc_t flags, void* data) { if(!data) return HSA_STATUS_ERROR; - if(!reinterpret_cast(data)->tracepool) return HSA_STATUS_ERROR; - - auto& pool = *reinterpret_cast(data)->tracepool; + auto& pool = reinterpret_cast(data)->tracepool; if(!pool.allocate_fn || !pool.free_fn || !pool.allow_access_fn) return HSA_STATUS_ERROR; + hsa_status_t status = HSA_STATUS_ERROR; if(flags.host_access) { - hsa_status_t status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr); - if(!flags.device_access || status != HSA_STATUS_SUCCESS) return status; - return pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr); + status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr); + + if(status == HSA_STATUS_SUCCESS) + status = pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr); } - return pool.allocate_fn(pool.gpu_pool_, size, 0, ptr); + else + { + // Return page aligned data to avoid cache flush overlap + status = pool.allocate_fn(pool.gpu_pool_, size + 0x2000, 0, ptr); + *ptr = (void*) ((uintptr_t(*ptr) + 0xFFF) & ~0xFFFul); // NOLINT + } + return status; } void -TraceAQLPacket::Free(void* ptr, void* data) +BaseTTAQLPacket::Free(void* ptr, void* data) { - auto* pool = reinterpret_cast(data)->tracepool.get(); - if(!pool || !pool->free_fn) return; + assert(data); + auto& pool = reinterpret_cast(data)->tracepool; - pool->free_fn(ptr); + if(pool.free_fn) pool.free_fn(ptr); } hsa_status_t -TraceAQLPacket::Copy(void* dst, const void* src, size_t size, void* data) +BaseTTAQLPacket::Copy(void* dst, const void* src, size_t size, void* data) { - auto* pool = reinterpret_cast(data)->tracepool.get(); - if(!pool || !pool->api_copy_fn) return HSA_STATUS_ERROR; + if(!data) return HSA_STATUS_ERROR; + auto& pool = reinterpret_cast(data)->tracepool; - return pool->api_copy_fn(dst, src, size); + if(!pool.api_copy_fn) return HSA_STATUS_ERROR; + + return pool.api_copy_fn(dst, src, size); } -TraceAQLPacket::TraceAQLPacket(std::shared_ptr& _tracepool) -: tracepool(_tracepool){}; +TraceControlAQLPacket::TraceControlAQLPacket(const TraceMemoryPool& _tracepool, + const aqlprofile_att_profile_t& p) +: BaseTTAQLPacket(_tracepool) +{ + auto status = aqlprofile_att_create_packets(&handle, &packets, p, &Alloc, &Free, &Copy, this); + CHECK_HSA(status, "failed to create ATT packet"); + + packets.start_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + packets.stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + packets.start_packet.completion_signal = hsa_signal_t{.handle = 0}; + packets.stop_packet.completion_signal = hsa_signal_t{.handle = 0}; + this->empty = false; +}; + +void +TraceControlAQLPacket::populate_before() +{ + before_krn_pkt.push_back(packets.start_packet); + for(auto& [_, codeobj] : loaded_codeobj) + if(codeobj) before_krn_pkt.push_back(codeobj->packet); +}; + +CodeobjMarkerAQLPacket::CodeobjMarkerAQLPacket(const TraceMemoryPool& _tracepool, + uint64_t id, + uint64_t addr, + uint64_t size, + bool bFromStart, + bool bIsUnload) +: BaseTTAQLPacket(_tracepool) +{ + aqlprofile_att_codeobj_data_t codeobj{}; + codeobj.id = id; + codeobj.addr = addr; + codeobj.size = size; + codeobj.agent = _tracepool.gpu_agent; + codeobj.isUnload = bIsUnload; + codeobj.fromStart = bFromStart; + + auto status = aqlprofile_att_codeobj_marker(&packet, &handle, codeobj, &Alloc, &Free, this); + CHECK_HSA(status, "failed to create ATT marker"); + + packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + packet.completion_signal = hsa_signal_t{.handle = 0}; + this->empty = false; +} } // namespace hsa } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp b/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp index 3235a20158..8136e87697 100644 --- a/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp +++ b/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp @@ -61,10 +61,16 @@ public: AQLPacket(const AQLPacket&) = delete; AQLPacket& operator=(const AQLPacket&) = delete; - aqlprofile_handle_t pkt_handle = {.handle = 0}; - aqlprofile_pmc_aql_packets_t pkts = {.start_packet = null_amd_aql_pm4_packet, - .stop_packet = null_amd_aql_pm4_packet, - .read_packet = null_amd_aql_pm4_packet}; + void clear() + { + before_krn_pkt.clear(); + after_krn_pkt.clear(); + } + + virtual void populate_before() = 0; + virtual void populate_after() = 0; + + aqlprofile_handle_t pkt_handle = {.handle = 0}; bool empty = {true}; hsa_ven_amd_aqlprofile_profile_t profile = {}; @@ -87,6 +93,13 @@ public: : free_func{func} {}; ~CounterAQLPacket() override; + void populate_before() override { before_krn_pkt.push_back(start); }; + void populate_after() override + { + after_krn_pkt.push_back(stop); + after_krn_pkt.push_back(read); + }; + protected: bool command_buf_mallocd = false; bool output_buffer_malloced = false; @@ -104,29 +117,73 @@ struct TraceMemoryPool decltype(hsa_memory_copy)* api_copy_fn; }; -class TraceAQLPacket : public AQLPacket +class BaseTTAQLPacket : public AQLPacket { friend class rocprofiler::aql::ThreadTraceAQLPacketFactory; +protected: + using desc_t = aqlprofile_buffer_desc_flags_t; + public: - TraceAQLPacket(std::shared_ptr& _tracepool); - TraceMemoryPool& GetPool() const { return *tracepool; } + BaseTTAQLPacket(const TraceMemoryPool& _tracepool) + : tracepool(_tracepool){}; + ~BaseTTAQLPacket() override { aqlprofile_att_delete_packets(this->handle); }; + aqlprofile_handle_t GetHandle() const { return handle; } - uint64_t GetAgent() const { return tracepool->gpu_agent.handle; } - ~TraceAQLPacket() override; + hsa_agent_t GetAgent() const { return tracepool.gpu_agent; } protected: - std::shared_ptr tracepool; - aqlprofile_att_control_aql_packets_t packets; - aqlprofile_handle_t handle; + TraceMemoryPool tracepool; + aqlprofile_handle_t handle; - static hsa_status_t Alloc(void** ptr, - size_t size, - aqlprofile_buffer_desc_flags_t flags, - void* data); + static hsa_status_t Alloc(void** ptr, size_t size, desc_t flags, void* data); static void Free(void* ptr, void* data); static hsa_status_t Copy(void* dst, const void* src, size_t size, void* data); }; +class CodeobjMarkerAQLPacket : public BaseTTAQLPacket +{ + friend class rocprofiler::aql::ThreadTraceAQLPacketFactory; + +public: + CodeobjMarkerAQLPacket(const TraceMemoryPool& tracepool, + uint64_t id, + uint64_t addr, + uint64_t size, + bool bFromStart, + bool bIsUnload); + ~CodeobjMarkerAQLPacket() override = default; + + void populate_before() override { before_krn_pkt.push_back(packet); }; + void populate_after() override{}; + + hsa_ext_amd_aql_pm4_packet_t packet; +}; + +class TraceControlAQLPacket : public BaseTTAQLPacket +{ + friend class rocprofiler::aql::ThreadTraceAQLPacketFactory; + using code_object_id_t = uint64_t; + +public: + TraceControlAQLPacket(const TraceMemoryPool& tracepool, + const aqlprofile_att_profile_t& profile); + ~TraceControlAQLPacket() override = default; + + void add_codeobj(code_object_id_t id, uint64_t addr, uint64_t size) + { + loaded_codeobj[id] = + std::make_unique(tracepool, id, addr, size, true, false); + } + void remove_codeobj(code_object_id_t id) { loaded_codeobj.erase(id); } + + void populate_before() override; + void populate_after() override { after_krn_pkt.push_back(packets.stop_packet); } + +private: + aqlprofile_att_control_aql_packets_t packets; + std::unordered_map> loaded_codeobj; +}; + } // namespace hsa } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp index f83773ac81..b966f9e6d6 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp +++ b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp @@ -257,7 +257,7 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) constexpr auto expected_context_size = 200UL; static_assert( sizeof(context::context) == - expected_context_size + sizeof(std::shared_ptr), + expected_context_size + sizeof(std::shared_ptr), "If you added a new field to context struct, make sure there is a check here if it " "requires queue interception. Once you have done so, increment expected_context_size"); @@ -274,8 +274,10 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) } else if(itr->thread_trace) { - enable_intercepter = true; - std::weak_ptr trace = itr->thread_trace; + enable_intercepter = true; + std::weak_ptr trace = itr->thread_trace; + + // TODO: Make it wrapper on HSA initialization pre_initialize.emplace_back( [trace](const AgentCache& cache, const CoreApiTable& core, const AmdExtTable& ext) { if(auto locked = trace.lock()) locked->resource_init(cache, core, ext); diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp b/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp index ade1400c6a..431059d83d 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp +++ b/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp @@ -51,146 +51,402 @@ } \ } +constexpr size_t ROCPROFILER_QUEUE_SIZE = 64; + namespace rocprofiler { -using AQLPacketOwner = std::unique_ptr; -using inst_pkt_t = common::container::small_vector, 4>; -using corr_id_map_t = hsa::Queue::queue_info_session_t::external_corr_id_map_t; - struct cbdata_t { void* tool_userdata; rocprofiler_att_shader_data_callback_t cb_fn; - std::vector* memory_space; + rocprofiler_correlation_id_t corr_id; }; +common::Synchronized> client; + +bool +AgentThreadTracer::Submit(hsa_ext_amd_aql_pm4_packet_t* packet) +{ + const uint64_t write_idx = add_write_index_relaxed_fn(queue, 1); + + size_t index = (write_idx % queue->size) * sizeof(hsa_ext_amd_aql_pm4_packet_t); + auto* queue_slot = reinterpret_cast(size_t(queue->base_address) + index); // NOLINT + + const auto* slot_data = reinterpret_cast(packet); + + memcpy(&queue_slot[1], &slot_data[1], sizeof(hsa_ext_amd_aql_pm4_packet_t) - sizeof(uint32_t)); + auto* header = reinterpret_cast*>(queue_slot); + + header->store(slot_data[0], std::memory_order_release); + signal_store_screlease_fn(queue->doorbell_signal, write_idx); + + int loops = 0; + while(load_read_index_relaxed_fn(queue) <= write_idx) + { + loops++; + usleep(1); + if(loops > 10000) // Add loop limit to prevent hang. TODO: Remove once stability proven + { + ROCP_ERROR << "Codeobj packet submission failed!"; + return false; + } + } + return true; +} + +AgentThreadTracer::AgentThreadTracer(thread_trace_parameter_pack _params, + const hsa::AgentCache& cache, + const CoreApiTable& coreapi, + const AmdExtTable& ext) +: params(std::move(_params)) +{ + factory = std::make_unique(cache, this->params, coreapi, ext); + cached_resources = factory->construct_packet(); + + auto status = coreapi.hsa_queue_create_fn(cache.get_hsa_agent(), + ROCPROFILER_QUEUE_SIZE, + HSA_QUEUE_TYPE_SINGLE, + nullptr, + nullptr, + UINT32_MAX, + UINT32_MAX, + &this->queue); + if(status != HSA_STATUS_SUCCESS) + { + ROCP_ERROR << "Failed to create thread trace async queue"; + this->queue = nullptr; + } + + queue_destroy_fn = coreapi.hsa_queue_destroy_fn; + signal_store_screlease_fn = coreapi.hsa_signal_store_screlease_fn; + add_write_index_relaxed_fn = coreapi.hsa_queue_add_write_index_relaxed_fn; + load_read_index_relaxed_fn = coreapi.hsa_queue_load_read_index_relaxed_fn; +} + +AgentThreadTracer::~AgentThreadTracer() +{ + std::unique_lock lk(trace_resources_mut); + + if(active_resources.packet != nullptr) + ROCP_WARNING << "Thread tracer being destroyed with thread trace active"; + + if(!this->queue) return; + + auto* packet = static_cast(active_resources.packet.get()); + if(packet) + { + packet->clear(); + packet->populate_after(); + + for(auto& after_packet : packet->after_krn_pkt) + Submit(&after_packet); + } + + if(queue_destroy_fn) queue_destroy_fn(this->queue); +} + /** * Callback we get from HSA interceptor when a kernel packet is being enqueued. * We return an AQLPacket containing the start/stop/read packets for injection. */ -AQLPacketOwner -pre_kernel_call(ThreadTracer& tracer, - const hsa::Queue& queue, - const hsa::rocprofiler_packet& kern_pkt, - uint64_t kernel_id, - const corr_id_map_t& extern_corr_ids, - const context::correlation_id* corr_id) +std::unique_ptr +AgentThreadTracer::pre_kernel_call(rocprofiler_att_control_flags_t control_flags, + rocprofiler_queue_id_t queue_id, + rocprofiler_correlation_id_t corr_id) { - (void) extern_corr_ids; - (void) corr_id; - - rocprofiler_correlation_id_t temp_corr_id; - temp_corr_id.internal = 0; - temp_corr_id.external.value = 0; - temp_corr_id.external.ptr = nullptr; - - auto control_flags = tracer.params->dispatch_cb_fn(queue.get_id(), - queue.get_agent().get_rocp_agent(), - temp_corr_id, - &kern_pkt.kernel_dispatch, - kernel_id, - tracer.params->callback_userdata); - if(control_flags == ROCPROFILER_ATT_CONTROL_NONE) return nullptr; - assert(control_flags == ROCPROFILER_ATT_CONTROL_START_AND_STOP && "Error: Not implemented"); + std::unique_lock lk(trace_resources_mut); - uint64_t agent = queue.get_agent().get_hsa_agent().handle; - std::lock_guard lk(tracer.trace_resources_mut); + if(control_flags == ROCPROFILER_ATT_CONTROL_STOP) + { + if(active_resources.packet == nullptr) + { + ROCP_ERROR << "Attempt at stopping a thread trace that has not started!\n"; + return nullptr; + } - try + active_resources.packet->clear(); + active_resources.packet->populate_after(); + data_is_ready.fetch_add(1); + return std::move(active_resources.packet); + } + + if(active_resources.packet != nullptr) { - auto moved = std::move(tracer.resources.at(agent)); - tracer.resources.erase(agent); - return moved; - } catch(std::out_of_range& e) - { - ROCP_WARNING << "Attempt to initialize ATT without allocated resources!\n"; + ROCP_ERROR << "Attempt at starting a thread trace while another was active!\n"; return nullptr; } + else + { + active_resources.corr_id = corr_id; + active_resources.queue_id = queue_id; + } + + if(cached_resources == nullptr) + { + ROCP_ERROR << "Attempt to initialize ATT without allocated resources!\n"; + return nullptr; + } + + cached_resources->clear(); + cached_resources->populate_before(); + + if((control_flags & ROCPROFILER_ATT_CONTROL_STOP) != 0) + { + cached_resources->populate_after(); + data_is_ready.fetch_add(1); + } + + return std::move(cached_resources); } hsa_status_t thread_trace_callback(uint32_t shader, void* buffer, uint64_t size, void* callback_data) { - void* tool_userdata = static_cast(callback_data)->tool_userdata; - auto callback_fn = *static_cast(callback_data)->cb_fn; - std::vector& cpu_data = *static_cast(callback_data)->memory_space; + void* tool_userdata = static_cast(callback_data)->tool_userdata; + auto callback_fn = *static_cast(callback_data)->cb_fn; - // TODO(gbaraldi): Handle parallel callbacks - static std::mutex mut; - std::lock_guard lk(mut); - - if(size > cpu_data.size()) cpu_data.resize(size + cpu_data.size()); - - auto status = hsa::get_queue_controller()->get_core_table().hsa_memory_copy_fn( - cpu_data.data(), buffer, size); - if(status != HSA_STATUS_SUCCESS) - { - ROCP_WARNING << "Failed to copy hsa memory!"; - return HSA_STATUS_SUCCESS; - } - - callback_fn(shader, cpu_data.data(), size, tool_userdata); + callback_fn(shader, buffer, size, tool_userdata); return HSA_STATUS_SUCCESS; } void -post_kernel_call(ThreadTracer& tracer, inst_pkt_t& aql) +AgentThreadTracer::post_kernel_call(std::unique_ptr&& aql) { - std::vector cpu_data{}; - auto pair = cbdata_t{tracer.params->callback_userdata, tracer.params->shader_cb_fn, &cpu_data}; + std::unique_lock lk(trace_resources_mut); + + active_resources.packet = std::move(aql); + + if(!active_resources.packet || data_is_ready.load() < 1) return; + auto* pkt = static_cast(active_resources.packet.get()); + + for(auto& record : remaining_codeobj_record) + { + if(!record.bUnload) + pkt->add_codeobj(record.id, record.addr, record.size); + else + pkt->remove_codeobj(record.id); + } + remaining_codeobj_record.clear(); + + cbdata_t cb_dt{}; + + cb_dt.corr_id = active_resources.corr_id; + cb_dt.tool_userdata = params.callback_userdata; + cb_dt.cb_fn = params.shader_cb_fn; + + auto status = aqlprofile_att_iterate_data(pkt->GetHandle(), thread_trace_callback, &cb_dt); + CHECK_HSA(status, "Failed to iterate ATT data"); + + data_is_ready.fetch_sub(1); + cached_resources = std::move(active_resources.packet); +} + +void +AgentThreadTracer::load_codeobj(code_object_id_t id, uint64_t addr, uint64_t size) +{ + std::unique_lock lk(trace_resources_mut); + + if(auto* pkt = static_cast(cached_resources.get())) + { + pkt->add_codeobj(id, addr, size); + return; + } + + remaining_codeobj_record.push_back({id, addr, size, false}); + + if(!queue) return; + + auto packet = factory->construct_load_marker_packet(id, addr, size); + bool bSuccess = Submit(&packet->packet); + + if(!bSuccess) // If something went wrong, don't delete packet to avoid CP memory access fault + packet.release(); +} + +void +AgentThreadTracer::unload_codeobj(code_object_id_t id) +{ + std::unique_lock lk(trace_resources_mut); + + if(auto* pkt = static_cast(cached_resources.get())) + { + pkt->remove_codeobj(id); + return; + } + + remaining_codeobj_record.push_back({id, 0, 0, true}); + + if(!queue) return; + + auto packet = factory->construct_unload_marker_packet(id); + bool bSuccess = Submit(&packet->packet); + + if(!bSuccess) // If something went wrong, don't delete packet to avoid CP memory access fault + packet.release(); +} + +// TODO: make this a wrapper on HSA load instead of registering +void +GlobalThreadTracer::codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /* user_data */, + void* callback_data) +{ + if(!callback_data) return; + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; + if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return; + + auto* rec = static_cast(record.payload); + assert(rec); + + GlobalThreadTracer& tracer = *static_cast(callback_data); + auto agent = rec->hsa_agent; + + std::shared_lock lk(tracer.agents_map_mut); + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + try + { + tracer.loaded_codeobjs.at(rec->hsa_agent).erase(rec->code_object_id); + } catch(std::exception& e) + { + ROCP_WARNING << "Codeobj unload called for invalid ID " << rec->code_object_id; + } + } + else + { + tracer.loaded_codeobjs[agent][rec->code_object_id] = {rec->load_delta, rec->load_size}; + } + + auto tracer_it = tracer.agents.find(agent); + if(tracer_it == tracer.agents.end()) return; + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + tracer_it->second->load_codeobj(rec->code_object_id, rec->load_delta, rec->load_size); + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + tracer_it->second->unload_codeobj(rec->code_object_id); +} + +void +GlobalThreadTracer::resource_init(const hsa::AgentCache& cache, + const CoreApiTable& coreapi, + const AmdExtTable& ext) +{ + auto agent = cache.get_hsa_agent(); + std::unique_lock lk(agents_map_mut); + + auto agent_it = agents.find(agent); + if(agent_it != agents.end()) + { + agent_it->second->active_queues.fetch_add(1); + return; + } + + auto new_tracer = std::make_unique(this->params, cache, coreapi, ext); + new_tracer->active_queues.store(1); + agents.emplace(agent, std::move(new_tracer)); +} + +void +GlobalThreadTracer::resource_deinit(const hsa::AgentCache& cache) +{ + std::unique_lock lk(agents_map_mut); + + auto agent_it = agents.find(cache.get_hsa_agent()); + if(agent_it == agents.end()) return; + + if(agent_it->second->active_queues.fetch_sub(1) > 1) return; + + agents.erase(cache.get_hsa_agent()); +} + +/** + * Callback we get from HSA interceptor when a kernel packet is being enqueued. + * We return an AQLPacket containing the start/stop/read packets for injection. + */ +std::unique_ptr +GlobalThreadTracer::pre_kernel_call(const hsa::Queue& queue, + rocprofiler_kernel_id_t kernel_id, + const context::correlation_id* corr_id) +{ + rocprofiler_correlation_id_t rocprof_corr_id = + rocprofiler_correlation_id_t{.internal = 0, .external = context::null_user_data}; + + if(corr_id) rocprof_corr_id.internal = corr_id->internal; + // TODO: Get external + + auto control_flags = params.dispatch_cb_fn(queue.get_id(), + queue.get_agent().get_rocp_agent(), + rocprof_corr_id, + kernel_id, + params.callback_userdata); + + if(control_flags == ROCPROFILER_ATT_CONTROL_NONE) return nullptr; + + std::shared_lock lk(agents_map_mut); + + auto it = agents.find(queue.get_agent().get_hsa_agent()); + assert(it != agents.end() && it->second != nullptr); + + auto packet = it->second->pre_kernel_call(control_flags, queue.get_id(), rocprof_corr_id); + if(packet != nullptr) post_move_data.fetch_add(1); + return packet; +} + +void +GlobalThreadTracer::post_kernel_call(GlobalThreadTracer::inst_pkt_t& aql) +{ + if(post_move_data.load() < 1) return; for(auto& aql_pkt : aql) { - auto* pkt = dynamic_cast(aql_pkt.first.get()); + auto* pkt = dynamic_cast(aql_pkt.first.get()); if(!pkt) continue; - auto status = aqlprofile_att_iterate_data(pkt->GetHandle(), thread_trace_callback, &pair); - CHECK_HSA(status, "Failed to iterate ATT data"); + std::shared_lock lk(agents_map_mut); + post_move_data.fetch_sub(1); - std::lock_guard lk(tracer.trace_resources_mut); - if(tracer.agent_active_queues.find(pkt->GetAgent()) != tracer.agent_active_queues.end()) - tracer.resources[pkt->GetAgent()] = std::move(aql_pkt.first); + auto it = agents.find(pkt->GetAgent()); + if(it != agents.end() && it->second != nullptr) + it->second->post_kernel_call(std::move(aql_pkt.first)); } } -common::Synchronized> client; - void -ThreadTracer::start_context() +GlobalThreadTracer::start_context() { + if(codeobj_client_ctx.handle != 0) + { + auto status = rocprofiler_start_context(codeobj_client_ctx); + if(status != ROCPROFILER_STATUS_SUCCESS) throw std::exception(); + } + // Only one thread should be attempting to enable/disable this context client.wlock([&](auto& client_id) { if(client_id) return; client_id = hsa::get_queue_controller()->add_callback( std::nullopt, - [=](const hsa::Queue& q, - const hsa::rocprofiler_packet& kern_pkt, - rocprofiler_kernel_id_t kernel_id, - rocprofiler_dispatch_id_t dispatch_id, - rocprofiler_user_data_t* user_data, - const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, - const context::correlation_id* corr_id) { - return pre_kernel_call(*this, q, kern_pkt, kernel_id, extern_corr_ids, corr_id); - (void) user_data; - (void) dispatch_id; + [=](const hsa::Queue& q, + const hsa::rocprofiler_packet& /* kern_pkt */, + rocprofiler_kernel_id_t kernel_id, + rocprofiler_dispatch_id_t /* dispatch_id */, + rocprofiler_user_data_t* /* user_data */, + const corr_id_map_t& /* extern_corr_ids */, + const context::correlation_id* corr_id) { + return this->pre_kernel_call(q, kernel_id, corr_id); }, - [=](const hsa::Queue& q, - hsa::rocprofiler_packet kern_pkt, - const hsa::Queue::queue_info_session_t& session, - inst_pkt_t& aql) { - post_kernel_call(*this, aql); - (void) session; - (void) kern_pkt; - (void) q; - }); + [=](const hsa::Queue& /* q */, + hsa::rocprofiler_packet /* kern_pkt */, + const hsa::Queue::queue_info_session_t& /* session */, + inst_pkt_t& aql) { this->post_kernel_call(aql); }); }); } void -ThreadTracer::stop_context() +GlobalThreadTracer::stop_context() { client.wlock([&](auto& client_id) { if(!client_id) return; @@ -201,39 +457,4 @@ ThreadTracer::stop_context() }); } -void -ThreadTracer::resource_init(const hsa::AgentCache& cache, - const CoreApiTable& coreapi, - const AmdExtTable& ext) -{ - uint64_t agent = cache.get_hsa_agent().handle; - std::lock_guard lk(trace_resources_mut); - - if(agent_active_queues.find(agent) != agent_active_queues.end()) - { - agent_active_queues.at(agent).fetch_add(1); - return; - } - - auto factory = aql::ThreadTraceAQLPacketFactory(cache, this->params, coreapi, ext); - resources[agent] = factory.construct_packet(); - agent_active_queues[agent] = 1; -} - -void -ThreadTracer::resource_deinit(const hsa::AgentCache& cache) -{ - uint64_t agent = cache.get_hsa_agent().handle; - std::lock_guard lk(trace_resources_mut); - - try - { - if(agent_active_queues.at(agent).fetch_add(-1) > 1) return; - } catch(std::out_of_range&) - {} - - agent_active_queues.erase(agent); - resources.erase(agent); -} - } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp b/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp index d46fd7268f..91428b9571 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp +++ b/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp @@ -22,6 +22,8 @@ #pragma once +#include +#include #include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" #include @@ -30,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -56,7 +59,7 @@ struct thread_trace_parameter_pack static constexpr size_t DEFAULT_SIMD = 0x7; static constexpr size_t DEFAULT_SE_MASK = 0x21; - static constexpr size_t DEFAULT_BUFFER_SIZE = 0x6000000; + static constexpr size_t DEFAULT_BUFFER_SIZE = 0x8000000; }; namespace hsa @@ -64,21 +67,98 @@ namespace hsa class AQLPacket; }; -class ThreadTracer +struct ThreadTraceActiveResource { + rocprofiler_correlation_id_t corr_id; + rocprofiler_queue_id_t queue_id; + std::unique_ptr packet{nullptr}; +}; + +class AgentThreadTracer +{ + using code_object_id_t = uint64_t; + struct CodeobjRecord + { + code_object_id_t id; + uint64_t addr; + uint64_t size; + bool bUnload; + }; + public: - ThreadTracer(std::shared_ptr& _params) - : params(_params){}; + AgentThreadTracer(thread_trace_parameter_pack _params, + const hsa::AgentCache&, + const CoreApiTable&, + const AmdExtTable&); + virtual ~AgentThreadTracer(); + + void load_codeobj(code_object_id_t id, uint64_t addr, uint64_t size); + void unload_codeobj(code_object_id_t id); + + std::unique_ptr pre_kernel_call(rocprofiler_att_control_flags_t control_flags, + rocprofiler_queue_id_t queue_id, + rocprofiler_correlation_id_t corr_id); + + void post_kernel_call(std::unique_ptr&& aql); + + hsa_queue_t* queue = nullptr; + std::mutex trace_resources_mut; + thread_trace_parameter_pack params; + std::unique_ptr cached_resources; + ThreadTraceActiveResource active_resources; + std::atomic data_is_ready{0}; + std::atomic active_queues{1}; + std::vector remaining_codeobj_record; + + std::unique_ptr factory; + +private: + bool Submit(hsa_ext_amd_aql_pm4_packet_t* packet); + + decltype(hsa_queue_load_read_index_relaxed)* load_read_index_relaxed_fn{nullptr}; + decltype(hsa_queue_add_write_index_relaxed)* add_write_index_relaxed_fn{nullptr}; + decltype(hsa_signal_store_screlease)* signal_store_screlease_fn{nullptr}; + decltype(hsa_queue_destroy)* queue_destroy_fn{nullptr}; +}; // namespace thread_trace + +class GlobalThreadTracer +{ + struct CodeobjAddrRange + { + int64_t addr; + uint64_t size; + }; + using AQLPacketPtr = std::unique_ptr; + using inst_pkt_t = common::container::small_vector, 4>; + using corr_id_map_t = hsa::Queue::queue_info_session_t::external_corr_id_map_t; + using code_object_id_t = uint64_t; + +public: + GlobalThreadTracer(thread_trace_parameter_pack _params) + : params(std::move(_params)){}; virtual void start_context(); virtual void stop_context(); virtual void resource_init(const hsa::AgentCache&, const CoreApiTable&, const AmdExtTable&); virtual void resource_deinit(const hsa::AgentCache&); - virtual ~ThreadTracer() = default; + virtual ~GlobalThreadTracer() = default; - std::mutex trace_resources_mut; - std::shared_ptr params; - std::unordered_map> resources; - std::unordered_map> agent_active_queues; + static void codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data); + + std::unique_ptr pre_kernel_call(const hsa::Queue& queue, + uint64_t kernel_id, + const context::correlation_id* corr_id); + + void post_kernel_call(inst_pkt_t& aql); + + std::map> loaded_codeobjs; + std::unordered_map> agents; + + std::atomic post_move_data{0}; + std::shared_mutex agents_map_mut; + rocprofiler_context_id_t codeobj_client_ctx{0}; + thread_trace_parameter_pack params; }; // namespace thread_trace } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp b/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp index 56c2043499..39b7dcffa1 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp +++ b/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp @@ -25,6 +25,7 @@ #include "lib/rocprofiler-sdk/aql/helpers.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" #include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" extern "C" { rocprofiler_status_t ROCPROFILER_API @@ -35,16 +36,20 @@ rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t rocprofiler_att_shader_data_callback_t shader_callback, void* callback_userdata) { + if(rocprofiler::registration::get_init_status() > -1) + return ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED; + auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id); if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED; if(ctx->thread_trace) return ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED; - auto thread_tracer = std::make_shared(); + auto param_pack = rocprofiler::thread_trace_parameter_pack{}; - thread_tracer->context_id = context_id; - thread_tracer->dispatch_cb_fn = dispatch_callback; - thread_tracer->shader_cb_fn = shader_callback; - thread_tracer->callback_userdata = callback_userdata; + param_pack.context_id = context_id; + param_pack.dispatch_cb_fn = dispatch_callback; + param_pack.shader_cb_fn = shader_callback; + param_pack.callback_userdata = callback_userdata; + bool bEnableCodeobj = false; for(size_t p = 0; p < num_parameters; p++) { @@ -54,30 +59,38 @@ rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t switch(param.type) { - case ROCPROFILER_ATT_PARAMETER_TARGET_CU: thread_tracer->target_cu = param.value; break; + case ROCPROFILER_ATT_PARAMETER_TARGET_CU: param_pack.target_cu = param.value; break; case ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK: - thread_tracer->shader_engine_mask = param.value; + param_pack.shader_engine_mask = param.value; break; - case ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE: - thread_tracer->buffer_size = param.value; + case ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE: param_pack.buffer_size = param.value; break; + case ROCPROFILER_ATT_PARAMETER_SIMD_SELECT: param_pack.simd_select = param.value; break; + case ROCPROFILER_ATT_PARAMETER_CODE_OBJECT_TRACE_ENABLE: + bEnableCodeobj = param.value != 0; break; - case ROCPROFILER_ATT_PARAMETER_SIMD_SELECT: - thread_tracer->simd_select = param.value; - break; - case ROCPROFILER_ATT_PARAMETER_OCCUPANCY_MODE_ENABLE: - return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; - case ROCPROFILER_ATT_PARAMETER_PERFCOUNTERS_CTRL: - return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; - case ROCPROFILER_ATT_PARAMETER_PERFCOUNTER: - return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; case ROCPROFILER_ATT_PARAMETER_LAST: return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; } // for(int i = 0; i < parameters.perfcounter_num; i++) // thread_tracer->perfcounters.emplace_back(parameters.perfcounter[i]); } - ctx->thread_trace = std::make_shared(thread_tracer); + ctx->thread_trace = std::make_shared(param_pack); - return ROCPROFILER_STATUS_SUCCESS; + if(!bEnableCodeobj) return ROCPROFILER_STATUS_SUCCESS; // Skip TRACING_CODE_OBJECT setup + + auto& client_ctx = ctx->thread_trace->codeobj_client_ctx; + + rocprofiler_status_t status = rocprofiler_create_context(&client_ctx); + if(status != ROCPROFILER_STATUS_SUCCESS) return status; + + status = rocprofiler_configure_callback_tracing_service( + client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + rocprofiler::GlobalThreadTracer::codeobj_tracing_callback, + ctx->thread_trace.get()); + + return status; } } diff --git a/source/lib/rocprofiler-sdk/thread_trace/tests/att_packet_test.cpp b/source/lib/rocprofiler-sdk/thread_trace/tests/att_packet_test.cpp index a8d83a9bef..b803790261 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/tests/att_packet_test.cpp +++ b/source/lib/rocprofiler-sdk/thread_trace/tests/att_packet_test.cpp @@ -69,12 +69,15 @@ CoreApiTable& get_api_table() { static auto _v = []() { - auto val = CoreApiTable{}; - val.hsa_iterate_agents_fn = hsa_iterate_agents; - val.hsa_agent_get_info_fn = hsa_agent_get_info; - val.hsa_queue_create_fn = hsa_queue_create; - val.hsa_queue_destroy_fn = hsa_queue_destroy; - val.hsa_signal_wait_relaxed_fn = hsa_signal_wait_relaxed; + auto val = CoreApiTable{}; + val.hsa_iterate_agents_fn = hsa_iterate_agents; + val.hsa_agent_get_info_fn = hsa_agent_get_info; + val.hsa_queue_create_fn = hsa_queue_create; + val.hsa_queue_destroy_fn = hsa_queue_destroy; + val.hsa_signal_wait_relaxed_fn = hsa_signal_wait_relaxed; + val.hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed; + val.hsa_queue_add_write_index_relaxed_fn = hsa_queue_add_write_index_relaxed; + val.hsa_signal_store_screlease_fn = hsa_signal_store_screlease; return val; }(); return _v; @@ -83,32 +86,40 @@ get_api_table() void test_init() { - HsaApiTable table; - table.amd_ext_ = &get_ext_table(); - table.core_ = &get_api_table(); - agent::construct_agent_cache(&table); - ASSERT_TRUE(hsa::get_queue_controller() != nullptr); - hsa::get_queue_controller()->init(get_api_table(), get_ext_table()); + auto init = []() -> bool { + HsaApiTable table; + table.amd_ext_ = &get_ext_table(); + table.core_ = &get_api_table(); + agent::construct_agent_cache(&table); + hsa::get_queue_controller()->init(get_api_table(), get_ext_table()); + return true; + }; + [[maybe_unused]] static bool run_ince = init(); } } // namespace rocprofiler -using namespace rocprofiler::aql; +using namespace rocprofiler; -TEST(thread_trace, construct_default_packets) +TEST(thread_trace, resource_creation) { ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); - rocprofiler::test_init(); - auto agents = rocprofiler::hsa::get_queue_controller()->get_supported_agents(); + test_init(); + + registration::init_logging(); + registration::set_init_status(-1); + + auto agents = hsa::get_queue_controller()->get_supported_agents(); ASSERT_GT(agents.size(), 0); for(const auto& [_, agent] : agents) { - auto params = std::make_shared(); + auto params = thread_trace_parameter_pack{}; - ThreadTraceAQLPacketFactory factory( - agent, params, rocprofiler::get_api_table(), rocprofiler::get_ext_table()); + aql::ThreadTraceAQLPacketFactory factory(agent, params, get_api_table(), get_ext_table()); auto packet = factory.construct_packet(); + packet->populate_before(); + packet->populate_after(); size_t vendor_packet = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; ASSERT_TRUE(packet->start.header == vendor_packet); @@ -116,16 +127,42 @@ TEST(thread_trace, construct_default_packets) ASSERT_TRUE(packet->before_krn_pkt.size() > 0); ASSERT_TRUE(packet->after_krn_pkt.size() > 0); } + + { + thread_trace_parameter_pack params{}; + GlobalThreadTracer tracer(std::move(params)); + + for(const auto& [_, agent] : agents) + { + // Init twice to simulate two queues + tracer.resource_init(agent, get_api_table(), get_ext_table()); + tracer.resource_init(agent, get_api_table(), get_ext_table()); + } + + for(auto& [_, agenttracer] : tracer.agents) + { + agenttracer->load_codeobj(1, 0x1000, 0x1000); + agenttracer->load_codeobj(2, 0x3000, 0x1000); + agenttracer->unload_codeobj(1); + } + + for(const auto& [_, agent] : agents) + { + // Deinit twice to remove both queues + tracer.resource_deinit(agent); + tracer.resource_deinit(agent); + } + } hsa_shut_down(); } TEST(thread_trace, configure_test) { - rocprofiler::test_init(); + test_init(); - rocprofiler::registration::init_logging(); - rocprofiler::registration::set_init_status(-1); - rocprofiler::context::push_client(1); + registration::init_logging(); + registration::set_init_status(-1); + context::push_client(1); rocprofiler_context_id_t ctx; ROCPROFILER_CALL(rocprofiler_create_context(&ctx), "context creation failed"); @@ -142,8 +179,7 @@ TEST(thread_trace, configure_test) [](rocprofiler_queue_id_t, const rocprofiler_agent_t*, rocprofiler_correlation_id_t, - const hsa_kernel_dispatch_packet_t*, - uint64_t, + rocprofiler_kernel_id_t, void*) { return ROCPROFILER_ATT_CONTROL_NONE; }, [](int64_t, void*, size_t, void*) {}, nullptr); @@ -151,4 +187,5 @@ TEST(thread_trace, configure_test) ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); ROCPROFILER_CALL(rocprofiler_start_context(ctx), "context start failed"); ROCPROFILER_CALL(rocprofiler_stop_context(ctx), "context stop failed"); + hsa_shut_down(); } diff --git a/tests/thread-trace/CMakeLists.txt b/tests/thread-trace/CMakeLists.txt index 1bd77540d1..d2461ca143 100644 --- a/tests/thread-trace/CMakeLists.txt +++ b/tests/thread-trace/CMakeLists.txt @@ -104,25 +104,54 @@ foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) endif() endforeach() -set_source_files_properties(kernel_run.cpp PROPERTIES COMPILE_FLAGS "-g -O2") -set_source_files_properties(kernel_run.cpp PROPERTIES LANGUAGE HIP) +set_source_files_properties(kernel_branch.cpp PROPERTIES COMPILE_FLAGS "-g -O2") +set_source_files_properties(kernel_branch.cpp PROPERTIES LANGUAGE HIP) +set_source_files_properties(kernel_lds.cpp PROPERTIES COMPILE_FLAGS "-g -O2") +set_source_files_properties(kernel_lds.cpp PROPERTIES LANGUAGE HIP) +set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) -add_executable(thread-trace-api-test-binary) -target_sources(thread-trace-api-test-binary PRIVATE kernel_run.cpp verify_data.cpp) +# Single dispatch test +add_executable(thread-trace-api-single-test) +target_sources( + thread-trace-api-single-test PRIVATE main.cpp trace_callbacks.cpp single_dispatch.cpp + kernel_branch.cpp kernel_lds.cpp) +target_link_libraries(thread-trace-api-single-test PRIVATE rocprofiler::rocprofiler + amd_comgr dw) if(ROCPROFILER_MEMCHECK_PRELOAD_ENV) set(PRELOAD_ENV - "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") else() - set(PRELOAD_ENV "LD_PRELOAD=$") + set(PRELOAD_ENV "LD_PRELOAD=$") endif() -target_link_libraries(thread-trace-api-test-binary PRIVATE rocprofiler::rocprofiler - libdw::libdw amd_comgr) - -add_test(NAME thread-trace-api-tests COMMAND $) +add_test(NAME thread-trace-api-single-test + COMMAND $) set_tests_properties( - thread-trace-api-tests - PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${PRELOAD_ENV}" + thread-trace-api-single-test + PROPERTIES TIMEOUT 10 LABELS "integration-tests" ENVIRONMENT "${PRELOAD_ENV}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Multi dispatch test +add_executable(thread-trace-api-multi-test) +target_sources( + thread-trace-api-multi-test PRIVATE main.cpp trace_callbacks.cpp multi_dispatch.cpp + kernel_branch.cpp kernel_lds.cpp) +target_link_libraries(thread-trace-api-multi-test PRIVATE rocprofiler::rocprofiler + amd_comgr dw) + +if(ROCPROFILER_MEMCHECK_PRELOAD_ENV) + set(PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") +else() + set(PRELOAD_ENV "LD_PRELOAD=$") +endif() + +add_test(NAME thread-trace-api-multi-test + COMMAND $) + +set_tests_properties( + thread-trace-api-multi-test + PROPERTIES TIMEOUT 10 LABELS "integration-tests" ENVIRONMENT "${PRELOAD_ENV}" FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/tests/thread-trace/common.hpp b/tests/thread-trace/common.hpp new file mode 100644 index 0000000000..2e4f73ee64 --- /dev/null +++ b/tests/thread-trace/common.hpp @@ -0,0 +1,96 @@ +#pragma once +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_CALL(result, msg) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ + } \ + } + +#define C_API_BEGIN \ + try \ + { +#define C_API_END \ + } \ + catch(std::exception & e) \ + { \ + std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \ + } \ + catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; } + +namespace ATTTest +{ +struct TrackedIsa +{ + std::atomic hitcount{0}; + std::atomic latency{0}; + std::string inst{}; +}; + +struct pcInfo +{ + size_t addr; + size_t marker_id; + + bool operator==(const pcInfo& other) const + { + return addr == other.addr && marker_id == other.marker_id; + } + bool operator<(const pcInfo& other) const + { + if(marker_id == other.marker_id) return addr < other.addr; + return marker_id < other.marker_id; + } +}; + +struct ToolData +{ + std::unordered_map kernel_id_to_kernel_name = {}; + std::map> isa_map; + + std::atomic waves_started = 0; + std::atomic waves_ended = 0; + std::mutex isa_map_mut; + std::set wave_start_locations{}; +}; + +namespace Callbacks +{ +void +tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t*, + void* callback_data); + +void +shader_data_callback(int64_t se_id, void* se_data, size_t data_size, void* userdata); + +void +callbacks_init(); + +void +callbacks_fini(); + +}; // namespace Callbacks + +}; // namespace ATTTest \ No newline at end of file diff --git a/tests/thread-trace/kernel_branch.cpp b/tests/thread-trace/kernel_branch.cpp new file mode 100644 index 0000000000..d3c7aeeabf --- /dev/null +++ b/tests/thread-trace/kernel_branch.cpp @@ -0,0 +1,39 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include +#include "hip/hip_runtime.h" + +__global__ void +branching_kernel(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c) +{ + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if(blockIdx.x % 2 == 0) + a[index] = b[index] + c[index]; + else + a[index] = b[index] * c[index] - 2.0f; +} diff --git a/tests/thread-trace/kernel_lds.cpp b/tests/thread-trace/kernel_lds.cpp new file mode 100644 index 0000000000..bf767a75fc --- /dev/null +++ b/tests/thread-trace/kernel_lds.cpp @@ -0,0 +1,56 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include +#include "hip/hip_runtime.h" + +#define SHM_SIZE 64 + +__global__ void +looping_lds_kernel(float* __restrict__ a, + const float* __restrict__ b, + const float* __restrict__ c, + size_t size, + size_t loopcount) +{ + __shared__ float interm[SHM_SIZE]; + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + for(size_t i = index; i < size; i += blockDim.x * gridDim.x) + interm[threadIdx.x % SHM_SIZE] = b[index] + threadIdx.x; + + for(size_t it = 0; it < loopcount; it++) + { + __syncthreads(); + float value = interm[(it + threadIdx.x + SHM_SIZE / 2) % SHM_SIZE]; + __syncthreads(); + interm[threadIdx.x % SHM_SIZE] += value; + } + + a[index] = interm[threadIdx.x % SHM_SIZE] + c[index]; +} diff --git a/tests/thread-trace/kernel_run.cpp b/tests/thread-trace/main.cpp similarity index 62% rename from tests/thread-trace/kernel_run.cpp rename to tests/thread-trace/main.cpp index c174db8483..507fde9b0e 100644 --- a/tests/thread-trace/kernel_run.cpp +++ b/tests/thread-trace/main.cpp @@ -25,34 +25,27 @@ # undef NDEBUG #endif -/** - * @file samples/code_object_isa_decode/client.cpp - * - * @brief Example rocprofiler client (tool) - */ - #include #include #include #include "hip/hip_runtime.h" -// Three waves per SIMD on MI300 -#define DATA_SIZE (304 * 64 * 4 * 3) +// Two waves per SIMD on MI300 +#define DATA_SIZE (304 * 64 * 4 * 2) #define HIP_API_CALL(CALL) assert((CALL) == hipSuccess) -template +#define SHM_SIZE 64 +#define LOOPCOUNT 4 + __global__ void -branching_kernel(T* __restrict__ a, - const float* __restrict__ b, - const float* __restrict__ c, - int size) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if(blockIdx.x % 2 == 0) - a[index] = b[index] + c[index]; - else - a[index] = b[index] * c[index] - 2.0f; -} +branching_kernel(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c); + +__global__ void +looping_lds_kernel(float* __restrict__ a, + const float* __restrict__ b, + const float* __restrict__ c, + size_t size, + size_t loopcount); class hipMemory { @@ -76,18 +69,27 @@ main(int argc, char** argv) hipMemory src2(DATA_SIZE); hipMemory dst(DATA_SIZE); - hipLaunchKernelGGL(branching_kernel, - dim3(DATA_SIZE / 64), - dim3(64), - 0, - 0, - dst.ptr, - src1.ptr, - src2.ptr, - DATA_SIZE); - - HIP_API_CALL(hipGetLastError()); HIP_API_CALL(hipDeviceSynchronize()); + for(size_t i = 0; i < LOOPCOUNT; i++) + { + hipLaunchKernelGGL( + branching_kernel, dim3(DATA_SIZE / 64), dim3(64), 0, 0, dst.ptr, src1.ptr, src2.ptr); + HIP_API_CALL(hipGetLastError()); + + hipLaunchKernelGGL(looping_lds_kernel, + dim3(DATA_SIZE / 64), + dim3(64), + 0, + 0, + dst.ptr, + src1.ptr, + src2.ptr, + DATA_SIZE, + LOOPCOUNT); + HIP_API_CALL(hipGetLastError()); + HIP_API_CALL(hipDeviceSynchronize()); + } + return 0; } \ No newline at end of file diff --git a/tests/thread-trace/multi_dispatch.cpp b/tests/thread-trace/multi_dispatch.cpp new file mode 100644 index 0000000000..46c8c090a1 --- /dev/null +++ b/tests/thread-trace/multi_dispatch.cpp @@ -0,0 +1,185 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include +#include +#include +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +constexpr double WAVE_RATIO_TOLERANCE = 0.05; + +namespace ATTTest +{ +namespace Multi +{ +rocprofiler_client_id_t* client_id = nullptr; + +rocprofiler_att_control_flags_t +dispatch_callback(rocprofiler_queue_id_t /* queue_id */, + const rocprofiler_agent_t* /* agent */, + rocprofiler_correlation_id_t /* correlation_id */, + rocprofiler_kernel_id_t kernel_id, + void* userdata) +{ + C_API_BEGIN + assert(userdata && "Dispatch callback passed null!"); + ToolData& tool = *reinterpret_cast(userdata); + + static std::atomic call_id{0}; + static std::string_view desired_func_name = "branching_kernel"; + + try + { + auto& kernel_name = tool.kernel_id_to_kernel_name.at(kernel_id); + if(kernel_name.find(desired_func_name) == std::string::npos) + return ROCPROFILER_ATT_CONTROL_NONE; + + int id = call_id.fetch_add(1); + if(id == 0) + return ROCPROFILER_ATT_CONTROL_START; + else if(id == 1) + return ROCPROFILER_ATT_CONTROL_STOP; + } catch(...) + { + std::cerr << "Could not find kernel id: " << kernel_id << std::endl; + } + + C_API_END + return ROCPROFILER_ATT_CONTROL_NONE; +} + +int +tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) +{ + Callbacks::callbacks_init(); + static rocprofiler_context_id_t client_ctx = {}; + + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + Callbacks::tool_codeobj_tracing_callback, + tool_data), + "code object tracing service configure"); + + std::vector params{}; + params.push_back({ROCPROFILER_ATT_PARAMETER_CODE_OBJECT_TRACE_ENABLE, 1}); + + ROCPROFILER_CALL(rocprofiler_configure_thread_trace_service(client_ctx, + params.data(), + params.size(), + dispatch_callback, + Callbacks::shader_data_callback, + tool_data), + "thread trace service configure"); + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), + "context validity check"); + if(valid_ctx == 0) + { + // notify rocprofiler that initialization failed + // and all the contexts, buffers, etc. created + // should be ignored + return -1; + } + + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); + + // no errors + return 0; +} + +void +tool_fini(void* tool_data) +{ + assert(tool_data && "tool_fini callback passed null!"); + ToolData& tool = *reinterpret_cast(tool_data); + + double wave_started = (double) tool.waves_started.load(); + double wave_event_ratio = wave_started / (wave_started + (double) tool.waves_ended.load()); + assert(wave_event_ratio > 0.5 - WAVE_RATIO_TOLERANCE); + assert(wave_event_ratio < 0.5 + WAVE_RATIO_TOLERANCE); + + // Expected: Two kernels in kernel_run.cpp + assert(tool.wave_start_locations.size() >= 2); + + // Expected at least one known code object ID + bool bHasMarkerId = false; + for(auto& pc : tool.wave_start_locations) + bHasMarkerId |= pc.marker_id != 0; + + assert(bHasMarkerId); + + Callbacks::callbacks_fini(); +} + +} // namespace Multi +} // namespace ATTTest + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t /* version */, + const char* /* runtime_version */, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate if main tool + if(priority > 0) return nullptr; + + // set the client name + id->name = "ATT_test_multi_dispatch"; + + // store client info + ATTTest::Multi::client_id = id; + + auto* data = new ATTTest::ToolData{}; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &ATTTest::Multi::tool_init, + &ATTTest::Multi::tool_fini, + reinterpret_cast(data)}; + + // return pointer to configure data + return &cfg; +} diff --git a/tests/thread-trace/single_dispatch.cpp b/tests/thread-trace/single_dispatch.cpp new file mode 100644 index 0000000000..79a9e90ccb --- /dev/null +++ b/tests/thread-trace/single_dispatch.cpp @@ -0,0 +1,185 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include +#include +#include +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +constexpr double WAVE_RATIO_TOLERANCE = 0.05; + +namespace ATTTest +{ +namespace Single +{ +rocprofiler_client_id_t* client_id = nullptr; + +rocprofiler_att_control_flags_t +dispatch_callback(rocprofiler_queue_id_t /* queue_id */, + const rocprofiler_agent_t* /* agent */, + rocprofiler_correlation_id_t /* correlation_id */, + rocprofiler_kernel_id_t kernel_id, + void* userdata) +{ + C_API_BEGIN + assert(userdata && "Dispatch callback passed null!"); + ToolData& tool = *reinterpret_cast(userdata); + + static std::atomic call_id{0}; + static std::string_view desired_func_name = "branching_kernel"; + + try + { + auto& kernel_name = tool.kernel_id_to_kernel_name.at(kernel_id); + if(kernel_name.find(desired_func_name) == std::string::npos) + return ROCPROFILER_ATT_CONTROL_NONE; + + if(call_id.fetch_add(1) == 0) return ROCPROFILER_ATT_CONTROL_START_AND_STOP; + } catch(...) + { + std::cerr << "Could not find kernel id: " << kernel_id << std::endl; + } + + C_API_END + return ROCPROFILER_ATT_CONTROL_NONE; +} + +int +tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) +{ + Callbacks::callbacks_init(); + static rocprofiler_context_id_t client_ctx = {}; + + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + Callbacks::tool_codeobj_tracing_callback, + tool_data), + "code object tracing service configure"); + + ROCPROFILER_CALL( + rocprofiler_configure_thread_trace_service( + client_ctx, nullptr, 0, dispatch_callback, Callbacks::shader_data_callback, tool_data), + "thread trace service configure"); + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), + "context validity check"); + if(valid_ctx == 0) + { + // notify rocprofiler that initialization failed + // and all the contexts, buffers, etc. created + // should be ignored + return -1; + } + + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); + + // no errors + return 0; +} + +void +tool_fini(void* tool_data) +{ + assert(tool_data && "tool_fini callback passed null!"); + ToolData& tool = *reinterpret_cast(tool_data); + + std::unique_lock isa_lk(tool.isa_map_mut); + + // Find largest instruction + size_t max_inst_size = 0; + for(auto& [addr, lines] : tool.isa_map) + max_inst_size = std::max(max_inst_size, lines->inst.size()); + + size_t total_hit = 0; + size_t total_cycles = 0; + + for(auto& [addr, line] : tool.isa_map) + { + total_hit += line->hitcount.load(std::memory_order_relaxed); + total_cycles += line->latency.load(std::memory_order_relaxed); + } + + assert(total_cycles > 0); + assert(total_hit > 0); + + double wave_started = (double) tool.waves_started.load(); + double wave_event_ratio = wave_started / (wave_started + (double) tool.waves_ended.load()); + assert(wave_event_ratio > 0.5 - WAVE_RATIO_TOLERANCE); + assert(wave_event_ratio < 0.5 + WAVE_RATIO_TOLERANCE); + + Callbacks::callbacks_fini(); +} + +} // namespace Single +} // namespace ATTTest + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t /* version */, + const char* /* runtime_version */, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate if main tool + if(priority > 0) return nullptr; + + // set the client name + id->name = "ATT_test_single_dispatch"; + + // store client info + ATTTest::Single::client_id = id; + + auto* data = new ATTTest::ToolData{}; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &ATTTest::Single::tool_init, + &ATTTest::Single::tool_fini, + reinterpret_cast(data)}; + + // return pointer to configure data + return &cfg; +} diff --git a/tests/thread-trace/trace_callbacks.cpp b/tests/thread-trace/trace_callbacks.cpp new file mode 100644 index 0000000000..0c0b56b018 --- /dev/null +++ b/tests/thread-trace/trace_callbacks.cpp @@ -0,0 +1,238 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include +#include +#include +#include +#include +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ATTTest +{ +namespace Callbacks +{ +using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; +using Instruction = rocprofiler::codeobj::disassembly::Instruction; + +CodeobjAddressTranslate* codeobjTranslate = nullptr; + +struct trace_data_t +{ + int64_t id; + uint8_t* data; + uint64_t size; + ToolData* tool; +}; + +void +tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /* user_data */, + void* callback_data) +{ + C_API_BEGIN + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; + if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return; + + assert(callback_data && "Shader callback passed null!"); + ToolData& tool = *reinterpret_cast(callback_data); + + if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + std::unique_lock lg(tool.isa_map_mut); + auto* data = static_cast(record.payload); + tool.kernel_id_to_kernel_name.emplace(data->kernel_id, data->kernel_name); + } + + if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return; + + auto* data = static_cast(record.payload); + if(!data || !data->uri) return; + + std::unique_lock lg(tool.isa_map_mut); + + if(std::string_view(data->uri).find("file:///") == 0) + { + codeobjTranslate->addDecoder( + data->uri, data->code_object_id, data->load_delta, data->load_size); + } + else + { + codeobjTranslate->addDecoder(reinterpret_cast(data->memory_base), + data->memory_size, + data->code_object_id, + data->load_delta, + data->load_size); + } + + C_API_END +} + +void +get_trace_data(rocprofiler_att_parser_data_type_t type, void* att_data, void* userdata) +{ + C_API_BEGIN + assert(userdata && "ISA callback passed null!"); + trace_data_t& trace_data = *reinterpret_cast(userdata); + assert(trace_data.tool && "ISA callback passed null!"); + ToolData& tool = *reinterpret_cast(trace_data.tool); + + std::unique_lock lk(tool.isa_map_mut); + + if(type == ROCPROFILER_ATT_PARSER_DATA_TYPE_OCCUPANCY) + { + const auto& ev = *reinterpret_cast(att_data); + tool.wave_start_locations.insert({ev.offset, ev.marker_id}); + if(ev.enabled) + tool.waves_started.fetch_add(1); + else + tool.waves_ended.fetch_add(1); + } + + if(type != ROCPROFILER_ATT_PARSER_DATA_TYPE_ISA) return; + + auto& event = *reinterpret_cast(att_data); + + pcInfo pc{event.offset, event.marker_id}; + auto it = tool.isa_map.find(pc); + if(it == tool.isa_map.end()) + { + auto ptr = std::make_unique(); + try + { + auto shared_inst = codeobjTranslate->get(pc.marker_id, pc.addr); + if(shared_inst == nullptr) return; + ptr->inst = shared_inst->inst; + } catch(...) + { + return; + } + it = tool.isa_map.emplace(pc, std::move(ptr)).first; + } + + it->second->hitcount.fetch_add(event.hitcount, std::memory_order_relaxed); + it->second->latency.fetch_add(event.latency, std::memory_order_relaxed); + C_API_END +} + +uint64_t +copy_trace_data(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userdata) +{ + trace_data_t& data = *reinterpret_cast(userdata); + *seid = data.id; + *buffer_size = data.size; + *buffer = data.data; + data.size = 0; + return *buffer_size; +} + +rocprofiler_status_t +isa_callback(char* isa_instruction, + uint64_t* isa_memory_size, + uint64_t* isa_size, + uint64_t marker_id, + uint64_t offset, + void* userdata) +{ + C_API_BEGIN + assert(userdata && "ISA callback passed null!"); + trace_data_t& trace_data = *reinterpret_cast(userdata); + assert(trace_data.tool && "ISA callback passed null!"); + ToolData& tool = *reinterpret_cast(trace_data.tool); + + std::shared_ptr instruction; + + try + { + std::unique_lock unique_lock(tool.isa_map_mut); + instruction = codeobjTranslate->get(marker_id, offset); + } catch(...) + { + return ROCPROFILER_STATUS_ERROR; + } + + if(!instruction.get()) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; + + { + size_t tmp_isa_size = *isa_size; + *isa_size = instruction->inst.size(); + + if(*isa_size > tmp_isa_size) return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; + } + + memcpy(isa_instruction, instruction->inst.data(), *isa_size); + *isa_memory_size = instruction->size; + + auto ptr = std::make_unique(); + ptr->inst = instruction->inst; + tool.isa_map.emplace(pcInfo{offset, marker_id}, std::move(ptr)); + return ROCPROFILER_STATUS_SUCCESS; + C_API_END + return ROCPROFILER_STATUS_ERROR; +} + +void +shader_data_callback(int64_t se_id, void* se_data, size_t data_size, void* userdata) +{ + C_API_BEGIN + assert(userdata && "Shader callback passed null!"); + ToolData& tool = *reinterpret_cast(userdata); + + trace_data_t data{.id = se_id, .data = (uint8_t*) se_data, .size = data_size, .tool = &tool}; + auto status = rocprofiler_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data); + if(status != ROCPROFILER_STATUS_SUCCESS) + std::cerr << "shader_data_callback failed with status " << status << std::endl; + C_API_END +} + +void +callbacks_init() +{ + codeobjTranslate = new CodeobjAddressTranslate(); +} +void +callbacks_fini() +{ + delete codeobjTranslate; +} + +} // namespace Callbacks +} // namespace ATTTest \ No newline at end of file diff --git a/tests/thread-trace/verify_data.cpp b/tests/thread-trace/verify_data.cpp deleted file mode 100644 index 59917613b6..0000000000 --- a/tests/thread-trace/verify_data.cpp +++ /dev/null @@ -1,415 +0,0 @@ -// MIT License -// -// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// undefine NDEBUG so asserts are implemented -#ifdef NDEBUG -# undef NDEBUG -#endif - -/** - * @file samples/code_object_isa_decode/client.cpp - * - * @brief Example rocprofiler client (tool) - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define WAVE_RATIO_TOLERANCE 0.05 - -#define ROCPROFILER_CALL(result, msg) \ - { \ - rocprofiler_status_t CHECKSTATUS = result; \ - if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ - { \ - std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ - std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ - << " failed with error code " << CHECKSTATUS << ": " << status_msg \ - << std::endl; \ - std::stringstream errmsg{}; \ - errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ - << status_msg << ")"; \ - throw std::runtime_error(errmsg.str()); \ - } \ - } - -#define C_API_BEGIN \ - try \ - { -#define C_API_END \ - } \ - catch(std::exception & e) \ - { \ - std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \ - } \ - catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; } - -namespace thread_trace_test_client -{ -using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; -using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; -using Instruction = rocprofiler::codeobj::disassembly::Instruction; -using CodeobjAddressTranslate = rocprofiler::codeobj::disassembly::CodeobjAddressTranslate; - -std::mutex isa_map_mut; -rocprofiler_client_id_t* client_id = nullptr; - -struct isa_map_elem_t -{ - std::atomic hitcount{0}; - std::atomic latency{0}; - std::shared_ptr code_line{nullptr}; -}; - -struct pcinfo_t -{ - uint64_t marker_id; - uint64_t addr; -}; - -bool -operator==(const pcinfo_t& a, const pcinfo_t& b) -{ - return a.addr == b.addr && a.marker_id == b.marker_id; -}; - -bool -operator<(const pcinfo_t& a, const pcinfo_t& b) -{ - if(a.marker_id == b.marker_id) return a.addr < b.addr; - return a.marker_id < b.marker_id; -}; - -struct ToolData -{ - std::unordered_map kernel_object_to_kernel_name = {}; - CodeobjAddressTranslate codeobjTranslate; - std::map> isa_map; - std::atomic waves_started = 0; - std::atomic waves_ended = 0; -}; - -struct trace_data_t -{ - int64_t id; - uint8_t* data; - uint64_t size; - ToolData* tool; -}; - -void -tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, - rocprofiler_user_data_t* /* user_data */, - void* callback_data) -{ - C_API_BEGIN - if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; - if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return; - - assert(callback_data && "Shader callback passed null!"); - ToolData& tool = *reinterpret_cast(callback_data); - - if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) - { - std::unique_lock lg(isa_map_mut); - auto* data = static_cast(record.payload); - tool.kernel_object_to_kernel_name.emplace(data->kernel_object, data->kernel_name); - } - - if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return; - - auto* data = static_cast(record.payload); - if(!data || !data->uri) return; - - std::unique_lock lg(isa_map_mut); - - if(std::string_view(data->uri).find("file:///") == 0) - { - tool.codeobjTranslate.addDecoder(data->uri, 0, data->load_delta, data->load_size); - } - else - { - tool.codeobjTranslate.addDecoder(reinterpret_cast(data->memory_base), - data->memory_size, - data->code_object_id, - data->load_delta, - data->load_size); - } - - C_API_END -} - -rocprofiler_att_control_flags_t -dispatch_callback(rocprofiler_queue_id_t /* queue_id */, - const rocprofiler_agent_t* /* agent */, - rocprofiler_correlation_id_t /* correlation_id */, - const hsa_kernel_dispatch_packet_t* dispatch_packet, - uint64_t /* kernel_id */, - void* userdata) -{ - C_API_BEGIN - assert(userdata && "Dispatch callback passed null!"); - ToolData& tool = *reinterpret_cast(userdata); - - static std::atomic call_id{0}; - static std::string_view desired_func_name = "branching_kernel"; - - try - { - auto& kernel_name = tool.kernel_object_to_kernel_name.at(dispatch_packet->kernel_object); - if(kernel_name.find(desired_func_name) == std::string::npos) - return ROCPROFILER_ATT_CONTROL_NONE; - - if(call_id.fetch_add(1) == 0) return ROCPROFILER_ATT_CONTROL_START_AND_STOP; - } catch(...) - { - std::cerr << "Could not find kernel object: " << dispatch_packet->kernel_object - << std::endl; - } - - C_API_END - return ROCPROFILER_ATT_CONTROL_NONE; -} - -void -get_trace_data(rocprofiler_att_parser_data_type_t type, void* att_data, void* userdata) -{ - C_API_BEGIN - assert(userdata && "ISA callback passed null!"); - trace_data_t& trace_data = *reinterpret_cast(userdata); - assert(trace_data.tool && "ISA callback passed null!"); - ToolData& tool = *reinterpret_cast(trace_data.tool); - - if(type == ROCPROFILER_ATT_PARSER_DATA_TYPE_OCCUPANCY) - { - const auto& ev = reinterpret_cast(att_data); - if(ev->enabled) - tool.waves_started.fetch_add(1); - else - tool.waves_ended.fetch_add(1); - } - - if(type != ROCPROFILER_ATT_PARSER_DATA_TYPE_ISA) return; - - std::unique_lock lk(isa_map_mut); - auto& event = *reinterpret_cast(att_data); - - pcinfo_t pc{event.marker_id, event.offset}; - auto it = tool.isa_map.find(pc); - if(it == tool.isa_map.end()) - { - auto ptr = std::make_unique(); - try - { - ptr->code_line = tool.codeobjTranslate.get(pc.marker_id, pc.addr); - } catch(...) - { - return; - } - it = tool.isa_map.emplace(pc, std::move(ptr)).first; - } - - it->second->hitcount.fetch_add(event.hitcount, std::memory_order_relaxed); - it->second->latency.fetch_add(event.latency, std::memory_order_relaxed); - C_API_END -} - -uint64_t -copy_trace_data(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userdata) -{ - trace_data_t& data = *reinterpret_cast(userdata); - *seid = data.id; - *buffer_size = data.size; - *buffer = data.data; - data.size = 0; - return *buffer_size; -} - -rocprofiler_status_t -isa_callback(char* isa_instruction, - uint64_t* isa_memory_size, - uint64_t* isa_size, - uint64_t marker_id, - uint64_t offset, - void* userdata) -{ - C_API_BEGIN - assert(userdata && "ISA callback passed null!"); - trace_data_t& trace_data = *reinterpret_cast(userdata); - assert(trace_data.tool && "ISA callback passed null!"); - ToolData& tool = *reinterpret_cast(trace_data.tool); - - std::shared_ptr instruction; - - { - std::unique_lock unique_lock(isa_map_mut); - instruction = tool.codeobjTranslate.get(marker_id, offset); - } - - if(!instruction.get()) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; - - { - size_t tmp_isa_size = *isa_size; - *isa_size = instruction->inst.size(); - - if(*isa_size > tmp_isa_size) return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; - } - - memcpy(isa_instruction, instruction->inst.data(), *isa_size); - *isa_memory_size = instruction->size; - - auto ptr = std::make_unique(); - ptr->code_line = std::move(instruction); - tool.isa_map.emplace(pcinfo_t{marker_id, offset}, std::move(ptr)); - C_API_END - return ROCPROFILER_STATUS_SUCCESS; -} - -void -shader_data_callback(int64_t se_id, void* se_data, size_t data_size, void* userdata) -{ - C_API_BEGIN - assert(userdata && "Shader callback passed null!"); - ToolData& tool = *reinterpret_cast(userdata); - - trace_data_t data{.id = se_id, .data = (uint8_t*) se_data, .size = data_size, .tool = &tool}; - auto status = rocprofiler_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data); - if(status != ROCPROFILER_STATUS_SUCCESS) - std::cerr << "shader_data_callback failed with status " << status << std::endl; - C_API_END -} - -int -tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) -{ - static rocprofiler_context_id_t client_ctx = {}; - (void) fini_func; - ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); - - ROCPROFILER_CALL( - rocprofiler_configure_callback_tracing_service(client_ctx, - ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, - nullptr, - 0, - tool_codeobj_tracing_callback, - tool_data), - "code object tracing service configure"); - - ROCPROFILER_CALL( - rocprofiler_configure_thread_trace_service( - client_ctx, nullptr, 0, dispatch_callback, shader_data_callback, tool_data), - "thread trace service configure"); - - int valid_ctx = 0; - ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), - "context validity check"); - if(valid_ctx == 0) - { - // notify rocprofiler that initialization failed - // and all the contexts, buffers, etc. created - // should be ignored - return -1; - } - - ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); - - // no errors - return 0; -} - -void -tool_fini(void* tool_data) -{ - assert(tool_data && "tool_fini callback passed null!"); - ToolData& tool = *reinterpret_cast(tool_data); - - std::unique_lock isa_lk(isa_map_mut); - - // Find largest instruction - size_t max_inst_size = 0; - for(auto& [addr, lines] : tool.isa_map) - if(lines.get()) max_inst_size = std::max(max_inst_size, lines->code_line->inst.size()); - - assert(max_inst_size > 0); - - size_t total_hit = 0; - size_t total_cycles = 0; - - for(auto& [addr, line] : tool.isa_map) - { - total_hit += line->hitcount.load(std::memory_order_relaxed); - total_cycles += line->latency.load(std::memory_order_relaxed); - } - - assert(total_cycles > 0); - assert(total_hit > 0); - - double wave_started = (double) tool.waves_started.load(); - double wave_event_ratio = wave_started / (wave_started + (double) tool.waves_ended.load()); - assert(wave_event_ratio > 0.5 - WAVE_RATIO_TOLERANCE); - assert(wave_event_ratio < 0.5 + WAVE_RATIO_TOLERANCE); -} - -} // namespace thread_trace_test_client - -extern "C" rocprofiler_tool_configure_result_t* -rocprofiler_configure(uint32_t /* version */, - const char* /* runtime_version */, - uint32_t priority, - rocprofiler_client_id_t* id) -{ - // only activate if main tool - if(priority > 0) return nullptr; - - // set the client name - id->name = "Adv_Thread_Trace_Sample"; - - // store client info - thread_trace_test_client::client_id = id; - - auto* data = new thread_trace_test_client::ToolData{}; - - // create configure data - static auto cfg = - rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), - &thread_trace_test_client::tool_init, - &thread_trace_test_client::tool_fini, - reinterpret_cast(data)}; - - // return pointer to configure data - return &cfg; -}