From 42c6ffc0eb3d1e50c8b3718c410d4e88ffcf042a Mon Sep 17 00:00:00 2001 From: Vladimir Indic <139573562+vlaindic@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:02:47 +0100 Subject: [PATCH] Host trap PC sampling uses new record type (#1207) * Host trap PC sampling uses new record type * removing redundant field * formatting * simplifying templates in the parser - no need for HostTrap boolean * reviving some parser tests * hw_id decoding on GFX9 * HW id parser test * parser CID test * Parser multigpu test * removing rocprofiler_pc_sampling_record_t and some fields from hw_id * simplifying parser context * keep bench test internally * initializing gfx9_hw_id_t differently * anonymous struct first * avoiding inlining initialization of struct [ROCm/rocprofiler-sdk commit: bc52c17e64d28fbaa47fc892213b7b8faca7f2b4] --- .../samples/pc_sampling/pcs.cpp | 22 +- .../source/include/rocprofiler-sdk/fwd.h | 3 +- .../include/rocprofiler-sdk/pc_sampling.h | 135 ++--- .../lib/rocprofiler-sdk/pc_sampling.cpp | 3 +- .../pc_sampling/parser/CMakeLists.txt | 4 +- .../pc_sampling/parser/correlation.hpp | 77 +-- .../{parser_types.h => parser_types.hpp} | 9 +- .../parser/pc_record_interface.cpp | 74 ++- .../parser/pc_record_interface.hpp | 45 +- .../pc_sampling/parser/stochastic_records.h | 180 ++++++ .../parser/tests/benchmark_test.cpp | 50 +- .../parser/tests/correlation_id_test.cpp | 230 +++++--- .../pc_sampling/parser/tests/gfx9test.cpp | 528 +++++++++++------- .../pc_sampling/parser/tests/mocks.hpp | 53 +- .../pc_sampling/parser/tests/multigpu.cpp | 167 +++--- .../pc_sampling/parser/translation.hpp | 406 +++++++++----- .../pc_sampling/tests/configure_service.cpp | 33 +- .../pc_sampling_vs_counter_collection.cpp | 6 +- .../pc_sampling/tests/query_configuration.cpp | 3 +- .../pc_sampling/tests/samples_processing.cpp | 3 +- .../rocprofiler-sdk/tests/pc_sampling/pcs.cpp | 26 +- 21 files changed, 1319 insertions(+), 738 deletions(-) rename 
projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/{parser_types.h => parser_types.hpp} (93%) create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h diff --git a/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp b/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp index d77b0dcc63..ccffc2e5c6 100644 --- a/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp +++ b/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp @@ -245,7 +245,8 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, picked_cfg->method, picked_cfg->unit, interval, - buffer_id); + buffer_id, + 0); if(status == ROCPROFILER_STATUS_SUCCESS) { *utils::get_output_stream() @@ -305,24 +306,25 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, } else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING) { - if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE) + if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE) { - auto* pc_sample = - static_cast(cur_header->payload); + auto* pc_sample = static_cast( + cur_header->payload); - ss << "(code_obj_id, offset): (" << pc_sample->pc.loaded_code_object_id << ", 0x" - << std::hex << pc_sample->pc.loaded_code_object_offset << "), " + ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x" + << std::hex << pc_sample->pc.code_object_offset << "), " << "timestamp: " << std::dec << pc_sample->timestamp << ", " << "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", " << "workgroup_id_(x=" << std::dec << std::setw(5) << pc_sample->workgroup_id.x << ", " << "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", " << "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), " - << "wave_id: " << std::setw(2) << static_cast(pc_sample->wave_id) + << "wave_in_group: " << std::setw(2) + << static_cast(pc_sample->wave_in_group) << ", " + << "chiplet: " << std::setw(2) + << 
static_cast(pc_sample->hw_id.chiplet) << ", " - << "chiplet: " << std::setw(2) << static_cast(pc_sample->chiplet) - << ", " - << "cu_id: " << pc_sample->hw_id << ", " + // << "cu_id: " << pc_sample->hw_id << ", " << "correlation: {internal=" << std::setw(7) << pc_sample->correlation_id.internal << ", " << "external=" << std::setw(5) << pc_sample->correlation_id.external.value << "}" diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index d2206c1784..31226e5fce 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -439,7 +439,8 @@ typedef enum typedef enum { ROCPROFILER_PC_SAMPLING_RECORD_NONE = 0, - ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE, ///< ::rocprofiler_pc_sampling_record_t + ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE, ///< ::rocprofiler_pc_sampling_record_host_trap_v0_t + ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE, ///< for the future use ROCPROFILER_PC_SAMPLING_RECORD_LAST, } rocprofiler_pc_sampling_record_kind_t; diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h index 802dbcecc8..87abd4a92e 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h @@ -99,6 +99,7 @@ ROCPROFILER_EXTERN_C_INIT * @param [in] unit - The unit appropriate to the PC sampling type/method. 
* @param [in] interval - frequency at which PC samples are generated * @param [in] buffer_id - id of the buffer used for delivering PC samples + * @param [in] flags - for future use * @return ::rocprofiler_status_t * @retval ::ROCPROFILER_STATUS_SUCCESS PC sampling service configured successfully * @retval ::ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE One of the scenarios is present: @@ -117,7 +118,8 @@ rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t conte rocprofiler_pc_sampling_method_t method, rocprofiler_pc_sampling_unit_t unit, uint64_t interval, - rocprofiler_buffer_id_t buffer_id) ROCPROFILER_API; + rocprofiler_buffer_id_t buffer_id, + int flags) ROCPROFILER_API; /** * @brief PC sampling configuration supported by a GPU agent. @@ -195,122 +197,69 @@ rocprofiler_query_pc_sampling_agent_configurations( void* user_data) ROCPROFILER_API ROCPROFILER_NONNULL(2, 3); /** - * @brief The header of the @ref rocprofiler_pc_sampling_record_t, indicating - * what fields of the @ref rocprofiler_pc_sampling_record_t instance are meaningful - * for the sample. + * @brief Information about the GPU part where wave was executing + * at the moment of sampling. */ -typedef struct +typedef struct rocprofiler_pc_sampling_hw_id_v0_t { - uint8_t valid : 1; /// ::rocprofiler_pc_sampling_snapshot_v1_t field is valid - uint8_t type : 4; - uint8_t has_stall_reason : 1; - uint8_t has_wave_cnt : 1; - uint8_t reserved : 1; /// for future use - - /// @var type - /// @brief The following values are possible: - /// - 0 - reserved - /// - 1 - host trap pc sample - /// - 2 - stochastic pc sample - /// - 3 - perfcounter (unsupported at the moment) - /// - other values does not mean anything at the moment - /// @var has_stall_reason - /// @brief whether the sample contains information about the stall reason. - /// If so, please @see rocprofiler_pc_sampling_snapshot_v1_t. 
- /// @var has_wave_cnt - /// @brief whether the @ref rocprofiler_pc_sampling_record_t::wave_count - /// contains meaningful value -} rocprofiler_pc_sampling_header_v1_t; - -/** - * @brief For future use. - * - * @todo: Provide the description - * @todo: Should we use bitfields because of C ABI portability? - * @todo: Should we abstract this to be architecture agnostic? - * @todo: Consider having a query to determine organization of this information. - */ -typedef struct -{ - uint32_t dual_issue_valu : 1; - uint32_t inst_type : 4; - uint32_t reason_not_issued : 7; - uint32_t arb_state_issue : 10; - uint32_t arb_state_stall : 10; -} rocprofiler_pc_sampling_snapshot_v1_t; + uint64_t chiplet : 6; ///< chiplet index (3 bits allocated by the ROCr runtime) + uint64_t wave_id : 7; ///< wave slot index + uint64_t simd_id : 2; ///< SIMD index + uint64_t pipe_id : 4; ///< pipe index + uint64_t cu_or_wgp_id : 4; ///< Index of compute unit on GFX9 or workgroup processor on other + ///< architectures + uint64_t shader_array_id : 1; ///< Shader array index + uint64_t shader_engine_id : 5; ///< shader engine index + uint64_t workgroup_id : 7; ///< thread_group index on GFX9, and workgroup index on GFX10+ + uint64_t vm_id : 6; ///< virtual memory ID + uint64_t queue_id : 4; ///< queue id + uint64_t microengine_id : 2; ///< ACE (microengine) index + uint64_t reserved0 : 16; ///< Reserved for the future use +} rocprofiler_pc_sampling_hw_id_v0_t; /** * @brief Sampled program counter. */ typedef struct { - uint64_t loaded_code_object_id; - uint64_t loaded_code_object_offset; + uint64_t code_object_id; + uint64_t code_object_offset; - /// @var loaded_code_object_id + /// @var code_object_id /// @brief id of the loaded code object instance that contains sampled PC. /// This fields holds the value ::ROCPROFILER_CODE_OBJECT_ID_NONE /// if the code object cannot be determined /// (e.g., sampled PC belongs to code generated by self modifying code). 
- /// @var loaded_code_object_offset - /// @brief If @ref loaded_code_object_id is different than ::ROCPROFILER_CODE_OBJECT_ID_NONE, + /// @var code_object_offset + /// @brief If @ref code_object_id is different than ::ROCPROFILER_CODE_OBJECT_ID_NONE, /// then this field contains the offset of the sampled PC relative to the /// ::rocprofiler_callback_tracing_code_object_load_data_t::load_base - /// of the code object instance with @ref loaded_code_object_id. + /// of the code object instance with @ref code_object_id. /// To calculate the original virtual address of the sampled PC, one can add the value /// of this field to the ::rocprofiler_callback_tracing_code_object_load_data_t::load_base. - /// The value of @ref loaded_code_object_offset matches + /// The value of @ref code_object_offset matches /// the virtual address of the sampled instruction (PC), only if the - /// @ref loaded_code_object_id is equal to the ::ROCPROFILER_CODE_OBJECT_ID_NONE. + /// @ref code_object_id is equal to the ::ROCPROFILER_CODE_OBJECT_ID_NONE. } rocprofiler_pc_t; -// TODO: The definition of this structure might change over time -// to reduce the space needed to represent a single sample. +// TODO: The definition of this struct might change over time. /** - * @brief ROCProfiler PC Sampling Record corresponding to the interrupted wave. + * @brief ROCProfiler Host-Trap PC Sampling Record. 
*/ -typedef struct +typedef struct rocprofiler_pc_sampling_record_host_trap_v0_t { - uint64_t size; ///< Size of this struct - rocprofiler_pc_sampling_header_v1_t flags; - uint8_t chiplet; ///< chiplet index - uint8_t wave_id; ///< wave identifier within the workgroup - uint8_t wave_issued : 1; - uint8_t reserved : 7; ///< reserved 7 bits, must be zero - uint32_t hw_id; ///< compute unit identifier - rocprofiler_pc_t pc; ///< information about sampled program counter - uint64_t exec_mask; - rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup - uint32_t wave_count; - uint64_t timestamp; ///< timestamp when sample is generated - rocprofiler_correlation_id_t correlation_id; - rocprofiler_pc_sampling_snapshot_v1_t - snapshot; ///< @see ::rocprofiler_pc_sampling_snapshot_v1_t - uint32_t reserved2; ///< for future use - - /// @var flags - /// @brief indicates what fields of this struct are meaningful for the represented sample. - /// The values depend on what the underlying GPU agent architecture supports. - /// @var wave_issued - /// @brief indicates whether the wave is issueing the instruction represented by the @ref pc - /// @var exec_mask - /// @brief shows how many SIMD lanes of the wave were executing the instruction - /// represented by the @ref pc. Useful to understand thread-divergance within the wave - /// @var wave_count - /// @brief number of active waves on the CU at the moment of sample generation - /// @var correlation_id - /// @brief correlation id of the API call that initiated a dispatch of the kernel - /// during whose execution the wave was interrupted at @ref pc. 
-} rocprofiler_pc_sampling_record_t; + uint64_t size; ///< Size of this struct + rocprofiler_pc_sampling_hw_id_v0_t hw_id; ///< @see ::rocprofiler_pc_sampling_hw_id_v0_t + rocprofiler_pc_t pc; ///< information about sampled program counter + uint64_t exec_mask; ///< active SIMD lanes when sampled + uint64_t timestamp; ///< timestamp when sample is generated + uint64_t dispatch_id; ///< originating kernel dispatch ID + rocprofiler_correlation_id_t correlation_id; ///< API launch call id that matches dispatch ID + rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup + uint32_t wave_in_group : 8; ///< wave position within the workgroup (0-31) + uint32_t reserved0 : 24; ///< reserved for the future use +} rocprofiler_pc_sampling_record_host_trap_v0_t; /** @} */ ROCPROFILER_EXTERN_C_FINI - -ROCPROFILER_CXX_CODE( - static_assert(sizeof(rocprofiler_pc_sampling_record_t) == 88, - "Increasing the size of the pc sampling record is not permitted.")); - -ROCPROFILER_CXX_CODE(static_assert(offsetof(rocprofiler_pc_sampling_record_t, chiplet) == 9 && - offsetof(rocprofiler_pc_sampling_record_t, reserved2) == 84, - "PC sampling record layout changed.")); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp index f4a749a50b..d4cf03d35e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp @@ -63,7 +63,8 @@ rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t conte rocprofiler_pc_sampling_method_t method, rocprofiler_pc_sampling_unit_t unit, uint64_t interval, - rocprofiler_buffer_id_t buffer_id) + rocprofiler_buffer_id_t buffer_id, + int /*flags*/) { if(!is_pc_sampling_explicitly_enabled()) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt 
b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt index d73469a06d..29eb608a82 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt @@ -1,7 +1,7 @@ set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES pc_record_interface.cpp) set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_HEADERS - correlation.hpp gfx9.hpp gfx11.hpp parser_types.h pc_record_interface.hpp rocr.h - translation.hpp) + correlation.hpp gfx9.hpp gfx11.hpp parser_types.hpp pc_record_interface.hpp rocr.h + stochastic_records.h translation.hpp) target_sources( rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp index 8aa22192f9..2a35198732 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp @@ -205,13 +205,13 @@ private: using address_range_t = rocprofiler::sdk::codeobj::segment::address_range_t; -template +template inline pcsample_status_t -add_upcoming_samples(const device_handle device, - const generic_sample_t* buffer, - const size_t available_samples, - Parser::CorrelationMap* corr_map, - rocprofiler_pc_sampling_record_t* samples) +add_upcoming_samples(const device_handle device, + const generic_sample_t* buffer, + const size_t available_samples, + Parser::CorrelationMap* corr_map, + PcSamplingRecordT* samples) { pcsample_status_t status = PCSAMPLE_STATUS_SUCCESS; auto cache_addr_range = address_range_t{0, 0, ROCPROFILER_CODE_OBJECT_ID_NONE}; @@ -226,15 +226,14 @@ add_upcoming_samples(const device_handle device, const auto* snap = reinterpret_cast(buffer + p); auto& pc_sample = 
samples[p]; - pc_sample = copySample((const void*) (buffer + p)); - pc_sample.size = sizeof(rocprofiler_pc_sampling_record_t); + pc_sample = copySample((const void*) (buffer + p)); // Convert PC -> (loaded code object id containing PC, offset within code object) if(!cache_addr_range.inrange(snap->pc)) cache_addr_range = table->find_codeobj_in_range(snap->pc); - pc_sample.pc.loaded_code_object_id = cache_addr_range.id; - pc_sample.pc.loaded_code_object_offset = snap->pc - cache_addr_range.addr; + pc_sample.pc.code_object_id = cache_addr_range.id; + pc_sample.pc.code_object_offset = snap->pc - cache_addr_range.addr; try { @@ -251,13 +250,13 @@ add_upcoming_samples(const device_handle device, return status; } -template +template inline pcsample_status_t -_parse_buffer(generic_sample_t* buffer, - uint64_t buffer_size, - user_callback_t callback, - void* userdata, - Parser::CorrelationMap* corr_map) +_parse_buffer(generic_sample_t* buffer, + uint64_t buffer_size, + user_callback_t callback, + void* userdata, + Parser::CorrelationMap* corr_map) { // Maximum size uint64_t index = 0; @@ -283,26 +282,31 @@ _parse_buffer(generic_sample_t* buffer, uint64_t pkt_counter = pkt.num_samples; if(index + pkt_counter > buffer_size) return PCSAMPLE_STATUS_OUT_OF_BOUNDS_ERROR; - bool bIsHostTrap = pkt.which_sample_type == AMD_HOST_TRAP_V1; + // I don't think we need this. 
+ // bool bIsHostTrap = pkt.which_sample_type == AMD_HOST_TRAP_V1; while(pkt_counter > 0) { - rocprofiler_pc_sampling_record_t* samples = nullptr; + PcSamplingRecordT* samples = nullptr; uint64_t available_samples = callback(&samples, pkt_counter, userdata); if(available_samples == 0 || available_samples > pkt_counter) return PCSAMPLE_STATUS_CALLBACK_ERROR; - if(bIsHostTrap) - { - status |= add_upcoming_samples( - pkt.device, buffer + index, available_samples, corr_map, samples); - } - else - { - status |= add_upcoming_samples( - pkt.device, buffer + index, available_samples, corr_map, samples); - } + // I don't think we need if-else here + // if(bIsHostTrap) + // { + // status |= add_upcoming_samples( + // pkt.device, buffer + index, available_samples, corr_map, samples); + // } + // else + // { + // status |= add_upcoming_samples( + // pkt.device, buffer + index, available_samples, corr_map, samples); + // } + + status |= add_upcoming_samples( + pkt.device, buffer + index, available_samples, corr_map, samples); index += available_samples; pkt_counter -= available_samples; @@ -329,19 +333,20 @@ _parse_buffer(generic_sample_t* buffer, * a size smaller than requested, then it may be called again requesting more memory. * @param[in] userdata parameter forwarded to the user callback. 
*/ -pcsample_status_t inline parse_buffer(generic_sample_t* buffer, - uint64_t buffer_size, - int gfxip_major, - user_callback_t callback, - void* userdata) +template +pcsample_status_t inline parse_buffer(generic_sample_t* buffer, + uint64_t buffer_size, + int gfxip_major, + user_callback_t callback, + void* userdata) { static auto corr_map = std::make_unique(); - auto parseSample_func = _parse_buffer; + auto parseSample_func = _parse_buffer; if(gfxip_major == 9) - parseSample_func = _parse_buffer; + parseSample_func = _parse_buffer; else if(gfxip_major == 11) - parseSample_func = _parse_buffer; + parseSample_func = _parse_buffer; else return PCSAMPLE_STATUS_INVALID_GFXIP; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp similarity index 93% rename from projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h rename to projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp index 3a2ea0dbc8..952c738064 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp @@ -81,13 +81,8 @@ enum pcsample_arb_issue_state }; } // namespace PCSAMPLE -union pcsample_header_v1_t -{ - rocprofiler_pc_sampling_header_v1_t flags; - uint8_t raw; -}; - -typedef uint64_t (*user_callback_t)(rocprofiler_pc_sampling_record_t**, uint64_t, void*); +template +using user_callback_t = uint64_t (*)(PcSamplingRecordT**, uint64_t, void*); /** * The types of errors to be returned by parse_buffer. 
diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp index 278866ac1c..0247ba4188 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp @@ -22,13 +22,31 @@ #include "lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp" +template <> uint64_t -PCSamplingParserContext::alloc(rocprofiler_pc_sampling_record_t** buffer, uint64_t size) +PCSamplingParserContext::alloc( + rocprofiler_pc_sampling_record_host_trap_v0_t** buffer, + uint64_t size) { std::unique_lock lock(mut); assert(buffer != nullptr); - data.emplace_back(std::make_unique(size)); - *buffer = data.back()->samples.data(); + host_trap_data.emplace_back( + std::make_unique>(size)); + *buffer = host_trap_data.back()->samples.data(); + return size; +} + +template <> +uint64_t +PCSamplingParserContext::alloc( + rocprofiler_pc_sampling_record_stochastic_v0_t** buffer, + uint64_t size) +{ + std::unique_lock lock(mut); + assert(buffer != nullptr); + stochastic_data.emplace_back( + std::make_unique>(size)); + *buffer = stochastic_data.back()->samples.data(); return size; } @@ -39,10 +57,21 @@ PCSamplingParserContext::parse(const upcoming_samples_t& upcoming, std::condition_variable& midway_signal, bool bRocrBufferFlip) { + bool bIsHostTrap = upcoming.which_sample_type == AMD_HOST_TRAP_V1; + // Template instantiation is faster! - auto parseSample_func = &PCSamplingParserContext::_parse; + auto parseSample_func = + bIsHostTrap + ? &PCSamplingParserContext::_parse + : &PCSamplingParserContext::_parse; if(gfxip_major == 11) - parseSample_func = &PCSamplingParserContext::_parse; + parseSample_func = + bIsHostTrap + ? 
&PCSamplingParserContext::_parse + : &PCSamplingParserContext::_parse; else if(gfxip_major != 9) return PCSAMPLE_STATUS_INVALID_GFXIP; @@ -98,11 +127,13 @@ PCSamplingParserContext::shouldFlipRocrBuffer(const dispatch_pkt_id_t& pkt) cons return corr_map->checkDispatch(pkt); } +template void PCSamplingParserContext::generate_upcoming_pc_record( - uint64_t agent_id_handle, - const rocprofiler_pc_sampling_record_t* samples, - size_t num_samples) + uint64_t agent_id_handle, + const PcSamplingRecordKindT* samples, + size_t num_samples, + rocprofiler_pc_sampling_record_kind_t record_kind) { auto buff_id = _agent_buffers.at(rocprofiler_agent_id_t{agent_id_handle}); rocprofiler::buffer::instance* buff = rocprofiler::buffer::get_buffer(buff_id); @@ -111,7 +142,28 @@ PCSamplingParserContext::generate_upcoming_pc_record( throw std::runtime_error(fmt::format("Buffer with id: {} does not exists", buff_id.handle)); for(size_t i = 0; i < num_samples; i++) - buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, - ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE, - samples[i]); + buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, record_kind, samples[i]); +} + +template <> +void +PCSamplingParserContext::generate_upcoming_pc_record( + uint64_t agent_id_handle, + const rocprofiler_pc_sampling_record_host_trap_v0_t* samples, + size_t num_samples) +{ + this->generate_upcoming_pc_record( + agent_id_handle, samples, num_samples, ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE); +} + +template <> +void +PCSamplingParserContext::generate_upcoming_pc_record< + rocprofiler_pc_sampling_record_stochastic_v0_t>( + uint64_t agent_id_handle, + const rocprofiler_pc_sampling_record_stochastic_v0_t* samples, + size_t num_samples) +{ + this->generate_upcoming_pc_record( + agent_id_handle, samples, num_samples, ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE); } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp 
b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp index d9b91e073c..4953bb52b8 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp @@ -24,7 +24,8 @@ #include "lib/rocprofiler-sdk/buffer.hpp" #include "lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp" -#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h" +#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp" +#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h" #include #include @@ -41,13 +42,14 @@ #include #include +template struct PCSamplingData { PCSamplingData(size_t size) : samples(size){}; PCSamplingData& operator=(PCSamplingData&) = delete; - std::vector samples; + std::vector samples; }; class PCSamplingParserContext @@ -55,13 +57,16 @@ class PCSamplingParserContext public: PCSamplingParserContext() : corr_map(std::make_unique()){}; + /** - * @brief Allocates some memory. TODO: Translate to Jonathan's buffer implementation. + * @brief Allocates some memory for samples. + * TODO: Translate to Jonathan's buffer implementation. * @param[out] buffer Pointer where samples are to be written to. * @param[in] size Number of samples requested. * @returns Number of samples actually allocated on *buffer. */ - uint64_t alloc(rocprofiler_pc_sampling_record_t** buffer, uint64_t size); + template + uint64_t alloc(PcSamplingRecordT** buffer, uint64_t size); /** * @brief Parses a chunk of samples. @@ -127,7 +132,7 @@ protected: * @brief Parses the given input data and generates pc sampling records. * Calls generate_upcoming_pc_record(). 
*/ - template + template pcsample_status_t _parse(const upcoming_samples_t& upcoming, const generic_sample_t* data_) { // std::shared_lock lock(mut); @@ -139,16 +144,16 @@ protected: while(pkt_counter > 0) { - rocprofiler_pc_sampling_record_t* samples = nullptr; - uint64_t memsize = alloc(&samples, pkt_counter); + PcSamplingRecordT* samples = nullptr; + uint64_t memsize = alloc(&samples, pkt_counter); if(memsize == 0 || memsize > pkt_counter) return PCSAMPLE_STATUS_CALLBACK_ERROR; auto* map = corr_map.get(); if(bIsHostTrap) - status |= add_upcoming_samples(dev, data_, memsize, map, samples); + status |= add_upcoming_samples(dev, data_, memsize, map, samples); else - status |= add_upcoming_samples(dev, data_, memsize, map, samples); + status |= add_upcoming_samples(dev, data_, memsize, map, samples); data_ += memsize; pkt_counter -= memsize; @@ -164,14 +169,26 @@ protected: */ pcsample_status_t flushForgetList(); static void generate_id_completion_record(const dispatch_pkt_id_t& pkt) { (void) pkt; }; - void generate_upcoming_pc_record(uint64_t agent_id_handle, - const rocprofiler_pc_sampling_record_t* samples, - size_t num_samples); + + template + void generate_upcoming_pc_record(uint64_t agent_id_handle, + const PcSamplingRecordT* samples, + size_t num_samples); + + template + void generate_upcoming_pc_record(uint64_t agent_id_handle, + const PcSamplingRecordT* samples, + size_t num_samples, + rocprofiler_pc_sampling_record_kind_t record_kind); //! Maps doorbells and dispatch_index to correlation_id std::unique_ptr corr_map; - //! Data allocated to store samples. Temporary. - std::vector> data; + //! Data allocated to store host trap and stochastic samples, respectively. + //! Temporary solution until we figured out a smooth way to copy data directly to SDK's buffers. + std::vector>> + host_trap_data; + std::vector>> + stochastic_data; //! Dispatches not yet completed. // Uses only the internal correlation_id. 
std::unordered_map active_dispatches; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h new file mode 100644 index 0000000000..c783565474 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h @@ -0,0 +1,180 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include +#include + +#include + +/** + * @brief The header of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t, indicating + * what fields of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t instance are meaningful + * for the sample. 
+ */ +typedef struct rocprofiler_pc_sampling_record_stochastic_header_t +{ + uint8_t valid : 1; ///< pc sample is valid + uint8_t has_memory_counter : 1; ///< pc sample provides memory counters information + ///< via ::rocprofiler_pc_sampling_memory_counters_t + uint8_t reserved_type : 6; +} rocprofiler_pc_sampling_record_stochastic_header_t; + +/** + * @brief Enumeration describing sampled instruction type. + */ +typedef enum rocprofiler_pc_sampling_instruction_type_t +{ + // Do we need ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NONE=0? (we defined *_NONE in some other + // enums ) If so, then parser needs to add offset +1 after determining the type + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_VALU = 0, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MATRIX, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_SCALAR, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_TEX, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS_DIRECT, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_FLAT, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_EXPORT, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MESSAGE, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BARRIER, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_JUMP, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_OTHER, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NO_INST, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_DUAL_VALU, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LAST +} rocprofiler_pc_sampling_instruction_type_t; + +/** + * @brief Enumeration describing reason for not issuing an instruction. + */ +typedef enum pcsample_reason_not_issued +{ + // Do we need ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_NONE=0? (we defined *_NONE in some + // other enums ) If so, then parser needs to add offset +1 after determining the reason. 
+ ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NOT_AVAILABLE = 0, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_WAITCNT, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_BARRIER, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_EX_STALL, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_OTHER_WAIT, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_SLEEP, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_LAST +} rocprofiler_pc_sampling_instruction_not_issued_reason_t; + +/** + * @brief Data provided by stochastic sampling hardware. + * + */ +typedef struct rocprofiler_pc_sampling_snapshot_v0_t +{ + uint32_t + reason_not_issued : 4; ///< The reason for not issuing an instruction. + ///< (9 different issue reason fits in 4 bits) + ///< The field takes one of the value defined in + ///< @ref ::rocprofiler_pc_sampling_instruction_not_issued_reason_t + uint32_t reserved0 : 1; ///< reserved for future use + uint32_t arb_state_issue_valu : 1; ///< arbiter issued a VALU instruction + uint32_t arb_state_issue_matrix : 1; ///< arbiter issued a matrix instruction + uint32_t arb_state_issue_lds : 1; ///< arbiter issued a LDS instruction + uint32_t arb_state_issue_lds_direct : 1; ///< arbiter issued a LDS direct instruction + uint32_t arb_state_issue_scalar : 1; ///< arbiter issued a scalar (SALU/SMEM) instruction + uint32_t arb_state_issue_vmem_tex : 1; ///< arbiter issued a texture instruction + uint32_t arb_state_issue_flat : 1; ///< arbiter issued a FLAT instruction + uint32_t arb_state_issue_exp : 1; ///< arbiter issued a export instruction + uint32_t arb_state_issue_misc : 1; ///< arbiter issued a miscellaneous instruction + uint32_t arb_state_issue_brmsg : 1; ///< arbiter issued a branch/message instruction + uint32_t 
arb_state_issue_reserved : 1; ///< reserved for the future use + // Replacing `uint32_t arb_state_stall : 10;` + uint32_t arb_state_stall_valu : 1; ///< VALU instruction was stalled when sampled is generated + uint32_t + arb_state_stall_matrix : 1; ///< matrix instruction was stalled when sampled is generated + uint32_t arb_state_stall_lds : 1; ///< LDS instruction was stalled when sampled is generated + uint32_t arb_state_stall_lds_direct : 1; ///< LDS direct instruction was stalled when sampled + ///< is generated + uint32_t arb_state_stall_scalar : 1; ///< Scalar (SALU/SMEM) instruction was stalled when + ///< sampled is generated + uint32_t arb_state_stall_vmem_tex : 1; ///< texture instruction was stalled when sampled is + ///< generated + uint32_t arb_state_stall_flat : 1; ///< flat instruction was stalled when sampled is generated + uint32_t arb_state_stall_exp : 1; ///< export instruction was stalled when sampled is generated + uint32_t arb_state_stall_misc : 1; ///< miscellaneous instruction was stalled when sampled is + ///< generated + uint32_t arb_state_stall_brmsg : 1; ///< branch/message instruction was stalled when sampled is + ///< generated + uint32_t arb_state_state_reserved : 1; ///< reserved for the future use + // We have two reserved bits + uint32_t + dual_issue_valu : 1; ///< two VALU instructions issued for coexecution (MI3xx specific) + uint32_t reserved1 : 1; ///< reserved for the future use + uint32_t reserved2 : 3; ///< reserved for the future use +} rocprofiler_pc_sampling_snapshot_v0_t; + +/** + * @brief Counters of issued instructions. + */ +typedef struct rocprofiler_pc_sampling_memory_counters_t +{ + uint32_t load_cnt : 6; ///< Counts the number of VMEM load instructions issued but not yet + ///< completed. + uint32_t store_cnt : 6; ///< Counts the number of VMEM store instructions issued but not yet + ///< completed. + uint32_t + bvh_cnt : 3; ///< Counts the number of VMEM BVH instructions issued but not yet completed. 
+ uint32_t sample_cnt : 6; ///< Counts the number of VMEM sample instructions issued but not yet + ///< completed. + uint32_t ds_cnt : 6; ///< Counts the number of LDS instructions issued but not yet completed. + uint32_t km_cnt : 5; ///< Counts the number of scalar memory reads and memory instructions + ///< issued but not yet completed. +} rocprofiler_pc_sampling_memory_counters_t; + +/** + * @brief ROCProfiler Stochastic PC Sampling Record. + */ +typedef struct rocprofiler_pc_sampling_record_stochastic_v0_t +{ + // TODO: use size to know whether memory counters exist or not + uint64_t size; ///< Size of this struct + rocprofiler_pc_sampling_record_stochastic_header_t + flags; ///< defines what fields are relevant for the sample + uint8_t wave_in_group; ///< wave position within the workgroup (0-15) + uint8_t wave_issued : 1; ///< wave issued the instruction represented with the PC + uint8_t inst_type : 5; ///< instruction type, takes a value defined in @ref + ///< ::rocprofiler_pc_sampling_instruction_type_t + uint8_t reserved : 2; ///< reserved 2 bits must be zero + rocprofiler_pc_sampling_hw_id_v0_t hw_id; ///< @see ::rocprofiler_pc_sampling_hw_id_v0_t + rocprofiler_pc_t pc; ///< information about sampled program counter + uint64_t exec_mask; ///< active SIMD lanes at the moment sampling + rocprofiler_dim3_t workgroup_id; ///< wave coordinates within the workgroup + uint32_t wave_count; /// active waves on the CU at the moment of sampling + uint64_t timestamp; ///< timestamp when sample is generated + uint64_t dispatch_id; ///< originating kernel dispatch ID + rocprofiler_correlation_id_t correlation_id; + rocprofiler_pc_sampling_snapshot_v0_t + snapshot; ///< @see ::rocprofiler_pc_sampling_snapshot_v0_t + rocprofiler_pc_sampling_memory_counters_t + memory_counters; ///< @see ::rocprofiler_pc_sampling_memory_counters_t +} rocprofiler_pc_sampling_record_stochastic_v0_t; diff --git 
a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/benchmark_test.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/benchmark_test.cpp index 4c9248998a..78a115ef1e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/benchmark_test.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/benchmark_test.cpp @@ -31,6 +31,7 @@ * Benchmarks how fast the parser can process samples on a single threaded case * Current: 5600X with -Ofast, up to >140 million samples/s or ~9GB/s R/W (18GB/s bidirectional) */ +template static bool Benchmark(bool bWarmup) { @@ -38,14 +39,16 @@ Benchmark(bool bWarmup) constexpr size_t DISP_PER_QUEUE = 8; constexpr size_t NUM_QUEUES = 4; - std::shared_ptr buffer = std::make_shared(); - std::array>, NUM_QUEUES> active_dispatches; + auto buffer = std::make_shared>(); + std::array>>, NUM_QUEUES> + active_dispatches; for(size_t q = 0; q < NUM_QUEUES; q++) { - std::shared_ptr queue = std::make_shared(DISP_PER_QUEUE * 2, buffer); + auto queue = std::make_shared>(DISP_PER_QUEUE * 2, buffer); for(size_t d = 0; d < DISP_PER_QUEUE; d++) - active_dispatches[q].push_back(std::make_shared(queue)); + active_dispatches[q].push_back( + std::make_shared>(queue)); } constexpr size_t TOTAL_NUM_SAMPLES = NUM_QUEUES * DISP_PER_QUEUE * SAMPLE_PER_DISPATCH; @@ -56,23 +59,24 @@ Benchmark(bool bWarmup) for(size_t i = 0; i < SAMPLE_PER_DISPATCH; i++) MockWave(dispatch).genPCSample(); - std::pair userdata; - userdata.first = new rocprofiler_pc_sampling_record_t[TOTAL_NUM_SAMPLES]; + std::pair userdata; + userdata.first = new PcSamplingRecordT[TOTAL_NUM_SAMPLES]; userdata.second = TOTAL_NUM_SAMPLES; - auto t0 = std::chrono::system_clock::now(); - CHECK_PARSER(parse_buffer( - (generic_sample_t*) buffer->packets.data(), - buffer->packets.size(), - GFXIP_MAJOR, - [](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) { - auto* 
pair = - reinterpret_cast*>(userdata_); + user_callback_t user_cb = + [](PcSamplingRecordT** sample, uint64_t size, void* userdata_) { + auto* pair = reinterpret_cast*>(userdata_); assert(TOTAL_NUM_SAMPLES == pair->second); *sample = pair->first; return size; - }, - &userdata)); + }; + + auto t0 = std::chrono::system_clock::now(); + CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + GFXIP_MAJOR, + user_cb, + &userdata)); auto t1 = std::chrono::system_clock::now(); float samples_per_us = float(TOTAL_NUM_SAMPLES) / (t1 - t0).count() * 1E3f; @@ -80,8 +84,7 @@ Benchmark(bool bWarmup) { std::cout << "Benchmark: Parsed " << int(samples_per_us * 1E3f + 0.5f) * 1E-3f << " Msample/s ("; - std::cout << int(sizeof(rocprofiler_pc_sampling_record_t) * samples_per_us) << " MB/s)" - << std::endl; + std::cout << int(sizeof(PcSamplingRecordT) * samples_per_us) << " MB/s)" << std::endl; } delete[] userdata.first; @@ -90,7 +93,12 @@ Benchmark(bool bWarmup) TEST(pcs_parser, benchmark_test) { - EXPECT_EQ(Benchmark(true), true); - EXPECT_EQ(Benchmark(false), true); - EXPECT_EQ(Benchmark(false), true); + // Tests for host trap v0 records + EXPECT_EQ(Benchmark(true), true); + EXPECT_EQ(Benchmark(false), true); + EXPECT_EQ(Benchmark(false), true); + // tests for stochastic v0 records + EXPECT_EQ(Benchmark(true), true); + EXPECT_EQ(Benchmark(false), true); + EXPECT_EQ(Benchmark(false), true); } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/correlation_id_test.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/correlation_id_test.cpp index df9f072797..eeb45d2db1 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/correlation_id_test.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/correlation_id_test.cpp @@ -33,15 +33,15 @@ std::mt19937 rdgen(1); /** * Sample user memory allocation callback. 
* It expects userdata to be cast-able to a pointer to - * std::vector> + * std::vector> */ +template static uint64_t -alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* userdata) +alloc_callback(PcSamplingRecordT** buffer, uint64_t size, void* userdata) { - *buffer = new rocprofiler_pc_sampling_record_t[size]; + *buffer = new PcSamplingRecordT[size]; auto& vector = - *reinterpret_cast>*>( - userdata); + *reinterpret_cast>*>(userdata); vector.push_back({*buffer, size}); return size; } @@ -50,35 +50,34 @@ alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* u * Uses the MockWave dispatch's unique_id store in the pc field to verify * the reconstructed correlation_id. */ +template static bool -check_samples(rocprofiler_pc_sampling_record_t* samples, uint64_t size) +check_samples(PcSamplingRecordT* samples, uint64_t size) { // TODO: replace with (code_obj_id, pc) for(size_t i = 0; i < size; i++) - if(samples[i].correlation_id.internal != samples[i].pc.loaded_code_object_offset) - return false; + if(samples[i].correlation_id.internal != samples[i].pc.code_object_offset) return false; return true; } -/** - * Simplest mock classes use, generates a single queue+dispatch with 2 PC samples. 
- */ -TEST(pcs_parser, hello_world) +template +void +pcs_parser_hello_world() { - std::shared_ptr buffer = std::make_shared(); - std::shared_ptr queue = std::make_shared(16, buffer); - std::shared_ptr dispatch = std::make_shared(queue); + auto buffer = std::make_shared>(); + auto queue = std::make_shared>(16, buffer); + auto dispatch = std::make_shared>(queue); buffer->genUpcomingSamples(2); MockWave(dispatch).genPCSample(); MockWave(dispatch).genPCSample(); - std::vector> all_allocations; + std::vector> all_allocations; CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), buffer->packets.size(), GFXIP_MAJOR, - alloc_callback, + alloc_callback, (void*) &all_allocations)); EXPECT_EQ(all_allocations.size(), 1); // HelloWorld: Incorrect number of callbacks @@ -91,23 +90,34 @@ TEST(pcs_parser, hello_world) } } +/** + * Simplest mock classes use, generates a single queue+dispatch with 2 PC samples. + */ +TEST(pcs_parser, hello_world) +{ + pcs_parser_hello_world(); + pcs_parser_hello_world(); +} + /** * A little more complicated. * Generates a few dispatches for 2 different queues and samples in forward and reverse order. * Checks if the reconstructed correlation_id is correct. 
*/ -TEST(pcs_parser, reverse_wave_order) +template +void +pcs_parser_reverse_wave_order() { - std::shared_ptr buffer = std::make_shared(); - std::shared_ptr queue1 = std::make_shared(16, buffer); - std::shared_ptr queue2 = std::make_shared(16, buffer); + auto buffer = std::make_shared>(); + auto queue1 = std::make_shared>(16, buffer); + auto queue2 = std::make_shared>(16, buffer); - std::vector> dispatches; - dispatches.push_back(std::make_shared(queue1)); - dispatches.push_back(std::make_shared(queue1)); - dispatches.push_back(std::make_shared(queue2)); - dispatches.push_back(std::make_shared(queue2)); - dispatches.push_back(std::make_shared(queue1)); + std::vector>> dispatches; + dispatches.push_back(std::make_shared>(queue1)); + dispatches.push_back(std::make_shared>(queue1)); + dispatches.push_back(std::make_shared>(queue2)); + dispatches.push_back(std::make_shared>(queue2)); + dispatches.push_back(std::make_shared>(queue1)); buffer->genUpcomingSamples(dispatches.size()); for(auto it = dispatches.rbegin(); it != dispatches.rend(); it++) @@ -116,12 +126,12 @@ TEST(pcs_parser, reverse_wave_order) for(auto it = dispatches.begin(); it != dispatches.end(); it++) MockWave(*it).genPCSample(); - std::vector> all_allocations; + std::vector> all_allocations; CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), buffer->packets.size(), GFXIP_MAJOR, - alloc_callback, + alloc_callback, (void*) &all_allocations)); EXPECT_EQ(all_allocations.size(), 2); // ReverseWaveOrder test: Incorrect number of callbacks @@ -135,29 +145,33 @@ TEST(pcs_parser, reverse_wave_order) } } -/** - * Creates a small queue and causes the dispatch_ids to wrap around a few times, and generates - * a single sample per dispatch. Checks the parser is properly handling the wrapping of queues. 
- */ -TEST(pcs_parser, dispatch_wrapping) +TEST(pcs_parser, reverse_wave_order) { - const int num_samples = 32; - std::shared_ptr buffer = std::make_shared(); - std::shared_ptr queue = std::make_shared(5, buffer); + pcs_parser_reverse_wave_order(); + pcs_parser_reverse_wave_order(); +} + +template +void +pcs_parser_dispatch_wrapping() +{ + const int num_samples = 32; + auto buffer = std::make_shared>(); + auto queue = std::make_shared>(5, buffer); for(int i = 0; i < num_samples; i++) { - auto dispatch = std::make_shared(queue); + auto dispatch = std::make_shared>(queue); buffer->genUpcomingSamples(1); MockWave(dispatch).genPCSample(); } - std::vector> all_allocations; + std::vector> all_allocations; CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), buffer->packets.size(), GFXIP_MAJOR, - alloc_callback, + alloc_callback, (void*) &all_allocations)); EXPECT_EQ(all_allocations.size(), @@ -172,39 +186,47 @@ TEST(pcs_parser, dispatch_wrapping) } /** - * Creates a few queues with a few dispatchs per queue. - * Adds random samples per dispatch, and checks the result. + * Creates a small queue and causes the dispatch_ids to wrap around a few times, and generates + * a single sample per dispatch. Checks the parser is properly handling the wrapping of queues. 
*/ -TEST(pcs_parser, random_samples) +TEST(pcs_parser, dispatch_wrapping) { - const int num_samples = 1024; - std::shared_ptr buffer = std::make_shared(); - std::shared_ptr queue1 = std::make_shared(16, buffer); - std::shared_ptr queue2 = std::make_shared(16, buffer); - std::shared_ptr queue3 = std::make_shared(16, buffer); - std::shared_ptr queue4 = std::make_shared(16, buffer); + pcs_parser_dispatch_wrapping(); + pcs_parser_dispatch_wrapping(); +} - std::vector> dispatches; - dispatches.push_back(std::make_shared(queue1)); - dispatches.push_back(std::make_shared(queue1)); - dispatches.push_back(std::make_shared(queue2)); - dispatches.push_back(std::make_shared(queue3)); - dispatches.push_back(std::make_shared(queue1)); - dispatches.push_back(std::make_shared(queue3)); - dispatches.push_back(std::make_shared(queue3)); - dispatches.push_back(std::make_shared(queue2)); - dispatches.push_back(std::make_shared(queue1)); +template +void +pcs_parser_random_samples() +{ + const int num_samples = 1024; + auto buffer = std::make_shared>(); + auto queue1 = std::make_shared>(16, buffer); + auto queue2 = std::make_shared>(16, buffer); + auto queue3 = std::make_shared>(16, buffer); + auto queue4 = std::make_shared>(16, buffer); + + std::vector>> dispatches; + dispatches.push_back(std::make_shared>(queue1)); + dispatches.push_back(std::make_shared>(queue1)); + dispatches.push_back(std::make_shared>(queue2)); + dispatches.push_back(std::make_shared>(queue3)); + dispatches.push_back(std::make_shared>(queue1)); + dispatches.push_back(std::make_shared>(queue3)); + dispatches.push_back(std::make_shared>(queue3)); + dispatches.push_back(std::make_shared>(queue2)); + dispatches.push_back(std::make_shared>(queue1)); buffer->genUpcomingSamples(num_samples); for(int i = 0; i < num_samples; i++) MockWave(dispatches[rdgen() % dispatches.size()]).genPCSample(); - std::vector> all_allocations; + std::vector> all_allocations; CHECK_PARSER(parse_buffer((generic_sample_t*) 
buffer->packets.data(), buffer->packets.size(), GFXIP_MAJOR, - alloc_callback, + alloc_callback, (void*) &all_allocations)); EXPECT_EQ(all_allocations.size(), 1); // RandomSamples test: Incorrect number of callbacks @@ -218,21 +240,29 @@ TEST(pcs_parser, random_samples) } /** - * Hammers the parser by creating and destrying queues at random, adding dispatches at random - * and generating PC samples at random. By default we use all 4 unique doorbells, - * queue size is 16 and we generate 10k samples dispatch. + * Creates a few queues with a few dispatchs per queue. + * Adds random samples per dispatch, and checks the result. */ -TEST(pcs_parser, queue_hammer) +TEST(pcs_parser, random_samples) +{ + pcs_parser_random_samples(); + pcs_parser_random_samples(); +} + +template +void +pcs_parser_queue_hammer() { constexpr int NUM_ACTIONS = 10000; constexpr int QSIZE = 16; constexpr int NUM_QUEUES = MockDoorBell::num_unique_bells; constexpr int ACTION_MAX = QSIZE * NUM_QUEUES / 2; - std::shared_ptr buffer = std::make_shared(); + auto buffer = std::make_shared>(); - std::array, NUM_QUEUES> queues; - std::array>, NUM_QUEUES> active_dispatches; + std::array>, NUM_QUEUES> queues; + std::array>>, NUM_QUEUES> + active_dispatches; int num_reset_queues = 0; int num_samples_generated = 0; @@ -241,9 +271,10 @@ TEST(pcs_parser, queue_hammer) size_t max_q_occupancy = 0; for(int i = 0; i < NUM_QUEUES; i++) - queues[i] = std::make_shared(QSIZE, buffer); + queues[i] = std::make_shared>(QSIZE, buffer); for(int i = 0; i < NUM_QUEUES; i++) - active_dispatches[i].push_back(std::make_shared(queues[i])); + active_dispatches[i].push_back( + std::make_shared>(queues[i])); for(int i = 0; i < NUM_ACTIONS; i++) { @@ -254,7 +285,7 @@ TEST(pcs_parser, queue_hammer) // Delete queue and create new one active_dispatches[q] = {}; queues[q].reset(); - queues[q] = std::make_shared(QSIZE, buffer); + queues[q] = std::make_shared>(QSIZE, buffer); num_reset_queues++; } else if(action > ACTION_MAX / 2 && 
active_dispatches[q].size() > 1) @@ -267,7 +298,8 @@ TEST(pcs_parser, queue_hammer) // Add new dispatch if(active_dispatches[q].size() < QSIZE) { - active_dispatches[q].push_back(std::make_shared(queues[q])); + active_dispatches[q].push_back( + std::make_shared>(queues[q])); num_dispatches_generated += 1; } @@ -276,7 +308,8 @@ TEST(pcs_parser, queue_hammer) for(auto& queue : active_dispatches) { EXPECT_NE(queue.size(), 0); - std::shared_ptr rand_dispatch = queue[rdgen() % queue.size()]; + std::shared_ptr> rand_dispatch = + queue[rdgen() % queue.size()]; MockWave(rand_dispatch).genPCSample(); num_samples_generated += 1; avg_q_occupancy += queue.size(); @@ -292,20 +325,20 @@ TEST(pcs_parser, queue_hammer) << std::endl; std::cout << "Max queue occupancy: " << max_q_occupancy << "\n\n" << std::endl; - std::vector> all_allocations; + std::vector> all_allocations; CHECK_PARSER(parse_buffer((generic_sample_t*) buffer->packets.data(), buffer->packets.size(), GFXIP_MAJOR, - alloc_callback, + alloc_callback, (void*) &all_allocations)); EXPECT_EQ(all_allocations.size(), NUM_ACTIONS); // QueueHammer test: Incorrect number of callbacks for(auto sb = 0ul; sb < all_allocations.size(); sb++) { - rocprofiler_pc_sampling_record_t* samples = all_allocations[sb].first; - size_t num_samples = all_allocations[sb].second; + PcSamplingRecordT* samples = all_allocations[sb].first; + size_t num_samples = all_allocations[sb].second; EXPECT_EQ(num_samples, NUM_QUEUES); // QueueHammer: Incorrect number of samples EXPECT_EQ(check_samples(samples, num_samples), @@ -314,12 +347,25 @@ TEST(pcs_parser, queue_hammer) } } -TEST(pcs_parser, multi_buffer) +/** + * Hammers the parser by creating and destrying queues at random, adding dispatches at random + * and generating PC samples at random. By default we use all 4 unique doorbells, + * queue size is 16 and we generate 10k samples dispatch. 
+ */ +TEST(pcs_parser, queue_hammer) { - std::shared_ptr firstBuffer = std::make_shared(); - std::shared_ptr queue = std::make_shared(16, firstBuffer); - std::shared_ptr dispatch1 = std::make_shared(queue); - std::shared_ptr dispatch2 = std::make_shared(queue); + pcs_parser_queue_hammer(); + pcs_parser_queue_hammer(); +} + +template +void +pcs_parser_multi_buffer() +{ + auto firstBuffer = std::make_shared>(); + auto queue = std::make_shared>(16, firstBuffer); + auto dispatch1 = std::make_shared>(queue); + auto dispatch2 = std::make_shared>(queue); firstBuffer->genUpcomingSamples(4); MockWave(dispatch1).genPCSample(); @@ -327,21 +373,21 @@ TEST(pcs_parser, multi_buffer) MockWave(dispatch1).genPCSample(); MockWave(dispatch2).genPCSample(); - std::shared_ptr secondBuffer = std::make_shared(); - const auto& packets = firstBuffer->packets; - secondBuffer->packets = std::vector(packets.begin() + 2, packets.end()); + auto secondBuffer = std::make_shared>(); + const auto& packets = firstBuffer->packets; + secondBuffer->packets = std::vector(packets.begin() + 2, packets.end()); - std::vector> all_allocations; + std::vector> all_allocations; CHECK_PARSER(parse_buffer((generic_sample_t*) firstBuffer->packets.data(), firstBuffer->packets.size(), GFXIP_MAJOR, - alloc_callback, + alloc_callback, (void*) &all_allocations)); CHECK_PARSER(parse_buffer((generic_sample_t*) secondBuffer->packets.data(), secondBuffer->packets.size(), GFXIP_MAJOR, - alloc_callback, + alloc_callback, (void*) &all_allocations)); EXPECT_EQ(all_allocations.size(), 2); // MultiBuffer: Incorrect number of callbacks @@ -352,4 +398,10 @@ TEST(pcs_parser, multi_buffer) delete[] all_allocations[0].first; delete[] all_allocations[1].first; -}; +} + +TEST(pcs_parser, multi_buffer) +{ + pcs_parser_multi_buffer(); + pcs_parser_multi_buffer(); +} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp 
b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp index 755c929632..2bf1f95f81 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp @@ -100,14 +100,15 @@ ARBCHECK2(ISSUE_EXP); \ ARBCHECK2(ISSUE_MISC); +template class WaveSnapTest { public: WaveSnapTest() { - buffer = std::make_shared(); - queue = std::make_shared(16, buffer); - dispatch = std::make_shared(queue); + buffer = std::make_shared>(); + queue = std::make_shared>(16, buffer); + dispatch = std::make_shared>(queue); } void Test() @@ -140,25 +141,27 @@ public: dispatch->submit(packet_union_t{.snap = snap}); }; - std::shared_ptr buffer; - std::shared_ptr queue; - std::shared_ptr dispatch; + std::shared_ptr> buffer; + std::shared_ptr> queue; + std::shared_ptr> dispatch; }; -class WaveCntTest : public WaveSnapTest +template +class WaveCntTest : public WaveSnapTest { public: void FillBuffers() override { // Loop over all possible wave_cnt - buffer->genUpcomingSamples(max_wave_number); + this->buffer->genUpcomingSamples(max_wave_number); for(size_t i = 0; i < max_wave_number; i++) - genPCSample(i, GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU); + this->genPCSample( + i, GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU); } void CheckBuffers() override { - auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 assert(parsed.size() == 1); assert(parsed[0].size() == max_wave_number); @@ -166,204 +169,336 @@ public: assert(parsed[0][i].wave_count == i); } - const size_t max_wave_number = 64; - std::vector snapshots; + const size_t max_wave_number = 64; + std::vector snapshots; }; -class InstTypeTest : public WaveSnapTest +// class InstTypeTest : public WaveSnapTest +// { +// public: +// void FillBuffers() override +// { +// // 
Loop over inst_type_issued +// UNROLL_TYPECHECK(); +// buffer->genUpcomingSamples(GFX9::TYPE_LAST); +// for(int i = 0; i < GFX9::TYPE_LAST; i++) +// genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); +// } + +// void CheckBuffers() override +// { +// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 +// assert(parsed.size() == 1); +// assert(parsed[0].size() == GFX9::TYPE_LAST); +// assert(snapshots.size() == GFX9::TYPE_LAST); + +// for(size_t i = 0; i < GFX9::TYPE_LAST; i++) +// assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type); +// } + +// std::vector snapshots; +// }; + +// class StallReasonTest : public WaveSnapTest +// { +// public: +// void FillBuffers() override +// { +// // Loop over reason_not_issued +// UNROLL_REASONCHECK(); +// buffer->genUpcomingSamples(GFX9::REASON_LAST); +// for(int i = 0; i < GFX9::REASON_LAST; i++) +// genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); +// } + +// void CheckBuffers() override +// { +// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 +// assert(parsed.size() == 1); +// assert(parsed[0].size() == GFX9::REASON_LAST); +// assert(snapshots.size() == GFX9::REASON_LAST); + +// for(size_t i = 0; i < GFX9::REASON_LAST; i++) +// assert(snapshots[i].reason_not_issued == parsed[0][i].snapshot.reason_not_issued); +// } + +// std::vector snapshots; +// }; + +// class ArbStateTest : public WaveSnapTest +// { +// public: +// void FillBuffers() override +// { +// // Loop over arb_state_issue +// UNROLL_ARBCHECK(); +// buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); +// for(int i = 0; i < GFX9::ISSUE_LAST; i++) +// for(int j = 0; j < GFX9::ISSUE_LAST; j++) +// genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j); +// } + +// void CheckBuffers() override +// { +// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 +// assert(parsed.size() == 1); +// assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); +// 
assert(snapshots.size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); + +// for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++) +// { +// auto& snap = snapshots[i]; +// assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue); +// assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall); +// } +// } + +// std::vector snapshots; +// }; + +// class WaveIssueAndErrorTest : public WaveSnapTest +// { +// void FillBuffers() override +// { +// buffer->genUpcomingSamples(16); +// for(int valid = 0; valid <= 1; valid++) +// for(int issued = 0; issued <= 1; issued++) +// for(int dual = 0; dual <= 1; dual++) +// for(int error = 0; error <= 1; error++) +// genPCSample(valid, issued, dual, error); +// } + +// void CheckBuffers() override +// { +// const int num_combinations = 16; +// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 +// assert(parsed.size() == 1); +// assert(parsed[0].size() == num_combinations); +// assert(compare.size() == num_combinations); + +// for(size_t i = 0; i < num_combinations; i++) +// { +// assert(compare[i].flags.valid == parsed[0][i].flags.valid); +// assert(compare[i].wave_issued == parsed[0][i].wave_issued); +// assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu); +// } +// } + +// union trap_snapshot_v1 +// { +// struct +// { +// uint32_t valid : 1; +// uint32_t issued : 1; +// uint32_t dual : 1; +// uint32_t reserved : 23; +// uint32_t error : 1; +// uint32_t reserved2 : 5; +// }; +// uint32_t raw; +// }; + +// void genPCSample(bool valid, bool issued, bool dual, bool error) +// { +// rocprofiler_pc_sampling_record_t sample; +// ::memset(&sample, 0, sizeof(sample)); +// // TODO: Since code objects are not mocked, use pc.code_object_offset +// // as the absolute physical address of the mocked PC. 
+// sample.pc.code_object_offset = dispatch->unique_id; + +// sample.correlation_id.internal = dispatch->getMockId().raw; + +// sample.flags.valid = valid && !error; +// sample.wave_issued = issued; +// sample.snapshot.dual_issue_valu = dual; + +// assert(dispatch.get()); + +// compare.push_back(sample); + +// trap_snapshot_v1 snap; +// snap.valid = valid; +// snap.issued = issued; +// snap.dual = dual; +// snap.error = error; + +// perf_sample_snapshot_v1 pss; +// pss.perf_snapshot_data = snap.raw; +// pss.correlation_id = dispatch->getMockId().raw; +// dispatch->submit(std::move(pss)); +// }; + +// std::vector compare; +// }; + +template +class HwIdTest : public WaveSnapTest { -public: - void FillBuffers() override - { - // Loop over inst_type_issued - UNROLL_TYPECHECK(); - buffer->genUpcomingSamples(GFX9::TYPE_LAST); - for(int i = 0; i < GFX9::TYPE_LAST; i++) - genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); - } - - void CheckBuffers() override - { - auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 - assert(parsed.size() == 1); - assert(parsed[0].size() == GFX9::TYPE_LAST); - assert(snapshots.size() == GFX9::TYPE_LAST); - - for(size_t i = 0; i < GFX9::TYPE_LAST; i++) - assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type); - } - - std::vector snapshots; -}; - -class StallReasonTest : public WaveSnapTest -{ -public: - void FillBuffers() override - { - // Loop over reason_not_issued - UNROLL_REASONCHECK(); - buffer->genUpcomingSamples(GFX9::REASON_LAST); - for(int i = 0; i < GFX9::REASON_LAST; i++) - genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); - } - - void CheckBuffers() override - { - auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 - assert(parsed.size() == 1); - assert(parsed[0].size() == GFX9::REASON_LAST); - assert(snapshots.size() == GFX9::REASON_LAST); - - for(size_t i = 0; i < GFX9::REASON_LAST; i++) - assert(snapshots[i].reason_not_issued == 
parsed[0][i].snapshot.reason_not_issued); - } - - std::vector snapshots; -}; - -class ArbStateTest : public WaveSnapTest -{ -public: - void FillBuffers() override - { - // Loop over arb_state_issue - UNROLL_ARBCHECK(); - buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); - for(int i = 0; i < GFX9::ISSUE_LAST; i++) - for(int j = 0; j < GFX9::ISSUE_LAST; j++) - genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j); - } - - void CheckBuffers() override - { - auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 - assert(parsed.size() == 1); - assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); - assert(snapshots.size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); - - for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++) - { - auto& snap = snapshots[i]; - assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue); - assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall); - } - } - - std::vector snapshots; -}; - -class WaveIssueAndErrorTest : public WaveSnapTest -{ - void FillBuffers() override - { - buffer->genUpcomingSamples(16); - for(int valid = 0; valid <= 1; valid++) - for(int issued = 0; issued <= 1; issued++) - for(int dual = 0; dual <= 1; dual++) - for(int error = 0; error <= 1; error++) - genPCSample(valid, issued, dual, error); - } - - void CheckBuffers() override - { - const int num_combinations = 16; - auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 - assert(parsed.size() == 1); - assert(parsed[0].size() == num_combinations); - assert(compare.size() == num_combinations); - - for(size_t i = 0; i < num_combinations; i++) - { - assert(compare[i].flags.valid == parsed[0][i].flags.valid); - assert(compare[i].wave_issued == parsed[0][i].wave_issued); - assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu); - } - } - - union trap_snapshot_v1 + union gfx9_hw_id_t { + uint32_t raw; struct { - uint32_t valid : 1; - uint32_t issued : 1; - uint32_t dual : 
1; - uint32_t reserved : 23; - uint32_t error : 1; - uint32_t reserved2 : 5; + uint32_t wave_id : 4; ///< wave slot index + uint32_t simd_id : 2; ///< SIMD index + uint32_t pipe_id : 2; ///< pipe index + uint32_t cu_id : 4; ///< Index of compute unit on GFX9 or workgroup processor on other + ///< architectures + uint32_t shader_array_id : 1; ///< Shader array index + uint32_t shader_engine_id : 3; ///< Shader engine index + uint32_t + threadgroup_id : 4; ///< thread_group index on GFX9, and workgroup index on GFX10+ + uint32_t vm_id : 4; ///< virtual memory ID + uint32_t queue_id : 3; ///< queue id + uint32_t gfx_context_state_id : 3; ///< GFX context (state) id (only on GFX9) - ignored + uint32_t microengine_id : 2; ///< ACE (microengine) index }; - uint32_t raw; }; - void genPCSample(bool valid, bool issued, bool dual, bool error) - { - rocprofiler_pc_sampling_record_t sample; - ::memset(&sample, 0, sizeof(sample)); - // TODO: Since code objects are not mocked, use pc.loaded_code_object_offset - // as the absolute physical address of the mocked PC.
- sample.pc.loaded_code_object_offset = dispatch->unique_id; - - sample.correlation_id.internal = dispatch->getMockId().raw; - - sample.flags.valid = valid && !error; - sample.wave_issued = issued; - sample.snapshot.dual_issue_valu = dual; - - assert(dispatch.get()); - - compare.push_back(sample); - - trap_snapshot_v1 snap; - snap.valid = valid; - snap.issued = issued; - snap.dual = dual; - snap.error = error; - - perf_sample_snapshot_v1 pss; - pss.perf_snapshot_data = snap.raw; - pss.correlation_id = dispatch->getMockId().raw; - dispatch->submit(std::move(pss)); - }; - - std::vector compare; -}; - -class WaveOtherFieldsTest : public WaveSnapTest -{ void FillBuffers() override { - buffer->genUpcomingSamples(3); - genPCSample(1, 2, 3, 4, 5, 6, 7, 8); // Counting - genPCSample(3, 5, 7, 11, 13, 17, 19, 23); // Some prime numbers - genPCSample(23, 19, 17, 13, 11, 7, 5, 3); // Some reversed primes + gfx9_hw_id_t hw_id_val0; // every field zero + hw_id_val0.wave_id = 0; + hw_id_val0.simd_id = 0; + hw_id_val0.pipe_id = 0; + hw_id_val0.cu_id = 0; + hw_id_val0.shader_array_id = 0; + hw_id_val0.shader_engine_id = 0; + hw_id_val0.threadgroup_id = 0; + hw_id_val0.vm_id = 0; + hw_id_val0.queue_id = 0; + hw_id_val0.gfx_context_state_id = 0; + hw_id_val0.microengine_id = 0; + + gfx9_hw_id_t hw_id_val1; // every field at the maximum its bit width allows + hw_id_val1.wave_id = 15; + hw_id_val1.simd_id = 3; + hw_id_val1.pipe_id = 3; + hw_id_val1.cu_id = 15; + hw_id_val1.shader_array_id = 1; + hw_id_val1.shader_engine_id = 7; + hw_id_val1.threadgroup_id = 15; + hw_id_val1.vm_id = 15; + hw_id_val1.queue_id = 7; + hw_id_val1.gfx_context_state_id = 7; + hw_id_val1.microengine_id = 3; + + gfx9_hw_id_t hw_id_val2; // arbitrary mix of in-range values + hw_id_val2.wave_id = 7; + hw_id_val2.simd_id = 2; + hw_id_val2.pipe_id = 2; + hw_id_val2.cu_id = 6; + hw_id_val2.shader_array_id = 0; + hw_id_val2.shader_engine_id = 3; + hw_id_val2.threadgroup_id = 8; + hw_id_val2.vm_id = 9; + hw_id_val2.queue_id = 3; + hw_id_val2.gfx_context_state_id = 2; + hw_id_val2.microengine_id = 1; + +
this->buffer->genUpcomingSamples(3); + genPCSample(hw_id_val0); + genPCSample(hw_id_val1); + genPCSample(hw_id_val2); } void CheckBuffers() override { - auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 + auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 assert(parsed.size() == 1); assert(parsed[0].size() == 3); assert(compare.size() == 3); for(size_t i = 0; i < 3; i++) { - assert(parsed[0][i].flags.has_stall_reason == true); - assert(parsed[0][i].flags.has_wave_cnt == true); - assert(parsed[0][i].flags.reserved == false); + // Comparing individual fields + assert(compare[i].hw_id.wave_id == parsed[0][i].hw_id.wave_id); + assert(compare[i].hw_id.simd_id == parsed[0][i].hw_id.simd_id); + assert(compare[i].hw_id.pipe_id == parsed[0][i].hw_id.pipe_id); + assert(compare[i].hw_id.cu_or_wgp_id == parsed[0][i].hw_id.cu_or_wgp_id); + assert(compare[i].hw_id.shader_array_id == parsed[0][i].hw_id.shader_array_id); + assert(compare[i].hw_id.shader_engine_id == parsed[0][i].hw_id.shader_engine_id); + assert(compare[i].hw_id.workgroup_id == parsed[0][i].hw_id.workgroup_id); + assert(compare[i].hw_id.vm_id == parsed[0][i].hw_id.vm_id); + assert(compare[i].hw_id.queue_id == parsed[0][i].hw_id.queue_id); + assert(compare[i].hw_id.microengine_id == parsed[0][i].hw_id.microengine_id); + } + } + + void genPCSample(gfx9_hw_id_t hw_id) + { + PcSamplingRecordT sample; + ::memset(&sample, 0, sizeof(sample)); + // Unpacking individual fields + // NOTE: chiplet is tested in WaveOtherFieldsTest, because it's not + // transferred via hw_id, but via the chiplet_and_wave_id field.
+ sample.hw_id.wave_id = hw_id.wave_id; + sample.hw_id.simd_id = hw_id.simd_id; + sample.hw_id.pipe_id = hw_id.pipe_id; + sample.hw_id.cu_or_wgp_id = hw_id.cu_id; + sample.hw_id.shader_array_id = hw_id.shader_array_id; + sample.hw_id.shader_engine_id = hw_id.shader_engine_id; + sample.hw_id.workgroup_id = hw_id.threadgroup_id; + sample.hw_id.vm_id = hw_id.vm_id; + sample.hw_id.queue_id = hw_id.queue_id; + sample.hw_id.microengine_id = hw_id.microengine_id; + + compare.push_back(sample); + + perf_sample_snapshot_v1 snap; + ::memset(&snap, 0, sizeof(snap)); + + // raw register value + snap.hw_id = hw_id.raw; + snap.correlation_id = this->dispatch->getMockId().raw; + + assert(this->dispatch.get()); + this->dispatch->submit(snap); + }; + + std::vector compare; +}; + +template +class WaveOtherFieldsTest : public WaveSnapTest +{ + void FillBuffers() override + { + this->buffer->genUpcomingSamples(3); + genPCSample(1, 2, 3, 4, 5, 6, 7); // Counting + genPCSample(3, 5, 7, 11, 13, 17, 19); // Some prime numbers + genPCSample(23, 19, 17, 13, 11, 7, 5); // Some reversed primes + } + + void CheckBuffers() override + { + auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 + assert(parsed.size() == 1); + assert(parsed[0].size() == 3); + assert(compare.size() == 3); + + for(size_t i = 0; i < 3; i++) + { + // TODO: if we decide to test flags, make specialization for + // rocprofiler_pc_sampling_record_stochastic_v0_t + // assert(parsed[0][i].flags.has_stall_reason == true); + // assert(parsed[0][i].flags.has_wave_cnt == true); + // assert(parsed[0][i].flags.reserved == false); assert(compare[i].exec_mask == parsed[0][i].exec_mask); assert(compare[i].workgroup_id == parsed[0][i].workgroup_id); - assert(compare[i].chiplet == parsed[0][i].chiplet); - assert(compare[i].wave_id == parsed[0][i].wave_id); - assert(compare[i].hw_id == parsed[0][i].hw_id); + assert(compare[i].hw_id.chiplet == parsed[0][i].hw_id.chiplet); + assert(compare[i].wave_in_group == 
parsed[0][i].wave_in_group); + // TODO: handle HW_ID as well. + // assert(compare[i].hw_id == parsed[0][i].hw_id); assert(compare[i].correlation_id.internal == parsed[0][i].correlation_id.internal); } } - void genPCSample(int pc, int exec, int blkx, int blky, int blkz, int chip, int wave, int hwid) + void genPCSample(int pc, int exec, int blkx, int blky, int blkz, int chip, int wave) { - rocprofiler_pc_sampling_record_t sample; + PcSamplingRecordT sample; ::memset(&sample, 0, sizeof(sample)); sample.exec_mask = exec; @@ -371,13 +506,15 @@ class WaveOtherFieldsTest : public WaveSnapTest sample.workgroup_id.y = blky; sample.workgroup_id.z = blkz; - sample.chiplet = chip; - sample.wave_id = wave; - sample.hw_id = hwid; - sample.correlation_id.internal = dispatch->unique_id; + sample.hw_id.chiplet = chip; + sample.wave_in_group = wave; + sample.correlation_id.internal = this->dispatch->unique_id; compare.push_back(sample); + // We're testing fields common for both perf_sample_host_trap_v1 and + // perf_sample_snapshot_v1, so either struct is suitable here.
No need to make + // specialization, perf_sample_snapshot_v1 snap; ::memset(&snap, 0, sizeof(snap)); snap.exec_mask = exec; @@ -386,26 +523,31 @@ class WaveOtherFieldsTest : public WaveSnapTest snap.workgroup_id_y = blky; snap.workgroup_id_z = blkz; snap.chiplet_and_wave_id = (chip << 8) | (wave & 0x3F); - snap.hw_id = hwid; - snap.correlation_id = dispatch->getMockId().raw; + snap.correlation_id = this->dispatch->getMockId().raw; - assert(dispatch.get()); - dispatch->submit(snap); + assert(this->dispatch.get()); + this->dispatch->submit(snap); (void) pc; }; - std::vector compare; + std::vector compare; }; TEST(pcs_parser, gfx9_test) { - WaveCntTest{}.Test(); - InstTypeTest{}.Test(); - StallReasonTest{}.Test(); - ArbStateTest{}.Test(); - WaveIssueAndErrorTest{}.Test(); - WaveOtherFieldsTest{}.Test(); + // Tests specific to stochastic sampling only + WaveCntTest{}.Test(); + // InstTypeTest{}.Test(); + // StallReasonTest{}.Test(); + // ArbStateTest{}.Test(); + // WaveIssueAndErrorTest{}.Test(); + + // Tests commong for both host trap and stochastic sampling. + HwIdTest{}.Test(); + HwIdTest{}.Test(); + WaveOtherFieldsTest{}.Test(); + WaveOtherFieldsTest{}.Test(); std::cout << "GFX9 Test Done." << std::endl; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp index ce56b2ac2a..ef8ad7c62c 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp @@ -46,6 +46,7 @@ /** * Mimics the rocprofiler buffer sent to the parser. */ +template class MockRuntimeBuffer { public: @@ -59,18 +60,21 @@ public: void submit(const packet_union_t& packet) { packets.push_back(packet); }; //! 
Submits a "upcoming_samples_t" packet signaling the next num_samples packets are PC samples - void genUpcomingSamples(int num_samples) + void genUpcomingSamples(int num_samples, upcoming_sample_t sample_type) { packet_union_t uni; ::memset(&uni, 0, sizeof(uni)); uni.upcoming.type = AMD_UPCOMING_SAMPLES; - uni.upcoming.which_sample_type = AMD_SNAPSHOT_V1; + uni.upcoming.which_sample_type = sample_type; uni.upcoming.num_samples = num_samples; uni.upcoming.device.handle = device; submit(uni); } - std::vector> get_parsed_buffer(int GFXIP_MAJOR) + //! Submits a "upcoming_samples_t" packet signaling the next num_samples packets are PC samples + void genUpcomingSamples(int num_samples); + + std::vector> get_parsed_buffer(int GFXIP_MAJOR) { parsed_data = {}; @@ -83,22 +87,38 @@ public: return parsed_data; } - static uint64_t alloc_parse_memory(rocprofiler_pc_sampling_record_t** sample, - uint64_t req_size, - void* userdata) + static uint64_t alloc_parse_memory(PcSamplingRecordT** sample, + uint64_t req_size, + void* userdata) { auto* buffer = reinterpret_cast(userdata); - buffer->parsed_data.push_back(std::vector(req_size)); + buffer->parsed_data.push_back(std::vector(req_size)); *sample = buffer->parsed_data.back().data(); return req_size; } - std::vector packets; - std::vector> parsed_data; + std::vector packets; + std::vector> parsed_data; const uint32_t device; }; +template <> // announces AMD_HOST_TRAP_V1 samples; inline because this header-defined full specialization is an ordinary function +inline void +MockRuntimeBuffer::genUpcomingSamples( + int num_samples) +{ + genUpcomingSamples(num_samples, AMD_HOST_TRAP_V1); +} + +template <> // announces AMD_SNAPSHOT_V1 samples; inline for the same one-definition reason +inline void +MockRuntimeBuffer::genUpcomingSamples( + int num_samples) +{ + this->genUpcomingSamples(num_samples, AMD_SNAPSHOT_V1); +} + /** * Mimics a HSA doorbell. Every live instance of this class has an unique ID (handler). * The handler itself may be not unique considering dead instances. @@ -149,10 +169,11 @@ private: * read and write pointers. * Creating an instance of this class automatically adds a queue creation packet to the buffer.
*/ +template class MockQueue { public: - MockQueue(int size_, std::shared_ptr& buffer_) + MockQueue(int size_, std::shared_ptr>& buffer_) : id(getUniqueId()) , size(size_) , doorbell() @@ -184,7 +205,7 @@ public: const MockDoorBell doorbell; const uint32_t device; - std::shared_ptr const buffer; + std::shared_ptr> const buffer; private: static size_t getUniqueId() @@ -198,10 +219,11 @@ private: * Mimics a kernel dispatch. * Creating an instance of this class automatically adds a dispatch creation packet to the buffer. */ +template class MockDispatch { public: - MockDispatch(std::shared_ptr& queue_) + MockDispatch(std::shared_ptr>& queue_) : queue(queue_) , dispatch_id(queue->write_index) , doorbell_id(queue->doorbell.handler) @@ -251,7 +273,7 @@ public: << " ds_id:" << dispatch_id << std::endl; } - std::shared_ptr const queue; + std::shared_ptr> const queue; const size_t dispatch_id; const size_t doorbell_id; @@ -273,10 +295,11 @@ private: * Instead of generating a valid program counter, this class uses the snapshot.pc field to * store the original dispatch's unique_id for later correctness verification. */ +template class MockWave { public: - MockWave(const std::shared_ptr& dispatch_) + MockWave(const std::shared_ptr>& dispatch_) : dispatch(dispatch_) {} @@ -295,5 +318,5 @@ public: << dispatch->unique_id << std::endl; } - std::shared_ptr const dispatch; + std::shared_ptr> const dispatch; }; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp index 21b50b66b0..3bd088d694 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp @@ -46,19 +46,20 @@ public: /** * Sample user memory allocation callback. 
* It expects userdata to be cast-able to a pointer to - * std::vector> + * std::vector> */ +template static uint64_t -alloc_callback(rocprofiler_pc_sampling_record_t** buffer, uint64_t size, void* userdata) +alloc_callback(PcSamplingRecordT** buffer, uint64_t size, void* userdata) { - *buffer = new rocprofiler_pc_sampling_record_t[size]; + *buffer = new PcSamplingRecordT[size]; auto& vector = - *reinterpret_cast>*>( - userdata); + *reinterpret_cast>*>(userdata); vector.push_back({*buffer, size}); return size; } +template void multithread_queue_hammer(size_t tid, Latch* latch) { @@ -70,10 +71,11 @@ multithread_queue_hammer(size_t tid, Latch* latch) constexpr int NUM_QUEUES = MockDoorBell::num_unique_bells / NUM_THREADS; constexpr int ACTION_MAX = QSIZE * NUM_QUEUES / 2; - std::shared_ptr buffer = std::make_shared(tid); + auto buffer = std::make_shared>(tid); - std::array, NUM_QUEUES> queues; - std::array>, NUM_QUEUES> active_dispatches; + std::array>, NUM_QUEUES> queues; + std::array>>, NUM_QUEUES> + active_dispatches; int num_reset_queues = 0; int num_samples_generated = 0; @@ -82,9 +84,10 @@ multithread_queue_hammer(size_t tid, Latch* latch) size_t max_q_occupancy = 0; for(int i = 0; i < NUM_QUEUES; i++) - queues[i] = std::make_shared(QSIZE, buffer); + queues[i] = std::make_shared>(QSIZE, buffer); for(int i = 0; i < NUM_QUEUES; i++) - active_dispatches[i].push_back(std::make_shared(queues[i])); + active_dispatches[i].push_back( + std::make_shared>(queues[i])); for(int i = 0; i < NUM_ACTIONS; i++) { @@ -95,7 +98,7 @@ multithread_queue_hammer(size_t tid, Latch* latch) // Delete queue and create new one active_dispatches[q] = {}; queues[q].reset(); - queues[q] = std::make_shared(QSIZE, buffer); + queues[q] = std::make_shared>(QSIZE, buffer); num_reset_queues++; } else if(action > ACTION_MAX / 2 && active_dispatches[q].size() > 1) @@ -108,7 +111,8 @@ multithread_queue_hammer(size_t tid, Latch* latch) // Add new dispatch if(active_dispatches[q].size() < QSIZE) { - 
active_dispatches[q].push_back(std::make_shared(queues[q])); + active_dispatches[q].push_back( + std::make_shared>(queues[q])); num_dispatches_generated += 1; } @@ -117,7 +121,8 @@ multithread_queue_hammer(size_t tid, Latch* latch) for(auto& queue : active_dispatches) { EXPECT_NE(queue.size(), 0); - std::shared_ptr rand_dispatch = queue[rdgen() % queue.size()]; + std::shared_ptr> rand_dispatch = + queue[rdgen() % queue.size()]; MockWave(rand_dispatch).genPCSample(); num_samples_generated += 1; avg_q_occupancy += queue.size(); @@ -127,23 +132,23 @@ multithread_queue_hammer(size_t tid, Latch* latch) latch->sync(); - std::vector> all_allocations; + std::vector> all_allocations; CHECK_PARSER(_parse_buffer((generic_sample_t*) buffer->packets.data(), buffer->packets.size(), - alloc_callback, + alloc_callback, (void*) &all_allocations, &corr_map)); EXPECT_EQ(all_allocations.size(), NUM_ACTIONS); // Incorrect number of callbacks for(auto sb = 0ul; sb < all_allocations.size(); sb++) { - rocprofiler_pc_sampling_record_t* samples = all_allocations[sb].first; - size_t num_samples = all_allocations[sb].second; + PcSamplingRecordT* samples = all_allocations[sb].first; + size_t num_samples = all_allocations[sb].second; EXPECT_EQ(num_samples, NUM_QUEUES); for(size_t i = 0; i < num_samples; i++) - EXPECT_EQ(samples[i].correlation_id.internal, samples[i].pc.loaded_code_object_offset); + EXPECT_EQ(samples[i].correlation_id.internal, samples[i].pc.code_object_offset); delete[] samples; } } @@ -152,6 +157,7 @@ multithread_queue_hammer(size_t tid, Latch* latch) * Benchmarks how fast the parser can process samples on a single threaded case * Current: 5600X with -Ofast, up to >140 million samples/s or ~9GB/s R/W (18GB/s bidirectional) */ +template static std::pair MultiThread_BenchMark(size_t tid, Latch* latch) { @@ -161,14 +167,16 @@ MultiThread_BenchMark(size_t tid, Latch* latch) constexpr size_t DISP_PER_QUEUE = 16; constexpr size_t NUM_QUEUES = 1; - std::shared_ptr buffer = 
std::make_shared(tid); - std::array>, NUM_QUEUES> active_dispatches; + auto buffer = std::make_shared>(tid); + std::array>>, NUM_QUEUES> + active_dispatches; for(size_t q = 0; q < NUM_QUEUES; q++) { - std::shared_ptr queue = std::make_shared(DISP_PER_QUEUE * 2, buffer); + auto queue = std::make_shared>(DISP_PER_QUEUE * 2, buffer); for(size_t d = 0; d < DISP_PER_QUEUE; d++) - active_dispatches[q].push_back(std::make_shared(queue)); + active_dispatches[q].push_back( + std::make_shared>(queue)); } constexpr size_t TOTAL_NUM_SAMPLES = NUM_QUEUES * DISP_PER_QUEUE * SAMPLE_PER_DISPATCH; @@ -179,29 +187,31 @@ MultiThread_BenchMark(size_t tid, Latch* latch) for(size_t i = 0; i < SAMPLE_PER_DISPATCH; i++) MockWave(dispatch).genPCSample(); - std::pair userdata; - userdata.first = new rocprofiler_pc_sampling_record_t[TOTAL_NUM_SAMPLES]; + std::pair userdata; + userdata.first = new PcSamplingRecordT[TOTAL_NUM_SAMPLES]; userdata.second = TOTAL_NUM_SAMPLES; latch->sync(); - auto t0 = std::chrono::system_clock::now(); - CHECK_PARSER(_parse_buffer( - (generic_sample_t*) buffer->packets.data(), - buffer->packets.size(), - [](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) { - auto* pair = - reinterpret_cast*>(userdata_); - *sample = pair->first; + user_callback_t user_cb = + [](PcSamplingRecordT** sample, uint64_t size, void* userdata_) { + auto* pair = reinterpret_cast*>(userdata_); + *sample = pair->first; return size; - }, - &userdata, - &corr_map)); + }; + + auto t0 = std::chrono::system_clock::now(); + CHECK_PARSER(_parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + user_cb, + &userdata, + &corr_map)); auto t1 = std::chrono::system_clock::now(); delete[] userdata.first; return {TOTAL_NUM_SAMPLES, (t1 - t0).count()}; } +template void multithread_codeobj(size_t tid, Latch* latch) { @@ -215,11 +225,11 @@ multithread_codeobj(size_t tid, Latch* latch) constexpr int NUM_SAMPLES = 50; constexpr int QSIZE = 16; - auto buffer 
= std::make_shared(tid); - auto queue = std::make_shared(QSIZE, buffer); + auto buffer = std::make_shared>(tid); + auto queue = std::make_shared>(QSIZE, buffer); - std::pair userdata; - userdata.first = new rocprofiler_pc_sampling_record_t[NUM_SAMPLES]; + std::pair userdata; + userdata.first = new PcSamplingRecordT[NUM_SAMPLES]; userdata.second = NUM_SAMPLES; latch->sync(); @@ -227,7 +237,7 @@ multithread_codeobj(size_t tid, Latch* latch) for(int d = 0; d < NUM_DISPATCH; d++) { buffer->packets.clear(); - auto dispatch = std::make_shared(queue); + auto dispatch = std::make_shared>(queue); const size_t pc_base_addr = NUM_SAMPLES * dispatch->unique_id; table->insert(addr_range_t{pc_base_addr, NUM_SAMPLES, dispatch->unique_id}); @@ -242,25 +252,25 @@ multithread_codeobj(size_t tid, Latch* latch) dispatch->submit(uni); } - CHECK_PARSER(_parse_buffer( - (generic_sample_t*) buffer->packets.data(), - buffer->packets.size(), - [](rocprofiler_pc_sampling_record_t** sample, uint64_t size, void* userdata_) { - auto* pair = - reinterpret_cast*>( - userdata_); - *sample = pair->first; + user_callback_t user_cb = + [](PcSamplingRecordT** sample, uint64_t size, void* userdata_) { + auto* pair = reinterpret_cast*>(userdata_); + *sample = pair->first; assert(size <= NUM_SAMPLES); return size; - }, - &userdata, - &corr_map)); + }; + + CHECK_PARSER(_parse_buffer((generic_sample_t*) buffer->packets.data(), + buffer->packets.size(), + user_cb, + &userdata, + &corr_map)); for(int s = 0; s < NUM_SAMPLES; s++) { const auto& pc = userdata.first[s].pc; - EXPECT_EQ(pc.loaded_code_object_id, dispatch->unique_id); - EXPECT_EQ(pc.loaded_code_object_offset, s); + EXPECT_EQ(pc.code_object_id, dispatch->unique_id); + EXPECT_EQ(pc.code_object_offset, s); } table->remove(addr_range_t{pc_base_addr, NUM_SAMPLES, dispatch->unique_id}); @@ -269,7 +279,9 @@ multithread_codeobj(size_t tid, Latch* latch) delete[] userdata.first; } -TEST(pcs_parser, bench_test) +template +void +pcs_parser_bench_test() { 
size_t time = 0; size_t samples = 0; @@ -280,7 +292,8 @@ TEST(pcs_parser, bench_test) std::vector>> threads{}; for(size_t t = 0; t < NUM_THREADS; t++) - threads.push_back(std::async(std::launch::async, MultiThread_BenchMark, t, &latch)); + threads.push_back(std::async( + std::launch::async, MultiThread_BenchMark, t, &latch)); if(it == 0) continue; // Skip warmup @@ -295,23 +308,47 @@ TEST(pcs_parser, bench_test) double mean = 1E3 * NUM_THREADS * samples / time; std::cout << "Benchmark: Parsed " << int(mean * 1E3 + 0.5) * 1E-3f << " Msample/s ("; - std::cout << int(sizeof(rocprofiler_pc_sampling_record_t) * mean) << " MB/s)" << std::endl; + std::cout << int(sizeof(PcSamplingRecordT) * mean) << " MB/s)" << std::endl; +}; + +TEST(pcs_parser, bench_test) +{ + pcs_parser_bench_test(); + pcs_parser_bench_test(); +} + +template +void +pcs_parser_hammer_test() +{ + Latch latch(NUM_THREADS); + + std::vector> threads{}; + for(size_t i = 0; i < NUM_THREADS; i++) + threads.push_back( + std::async(std::launch::async, multithread_queue_hammer, i, &latch)); }; TEST(pcs_parser, hammer_test) { - Latch latch(NUM_THREADS); + pcs_parser_hammer_test(); + pcs_parser_hammer_test(); +} - std::vector> threads{}; - for(size_t i = 0; i < NUM_THREADS; i++) - threads.push_back(std::async(std::launch::async, multithread_queue_hammer, i, &latch)); -}; - -TEST(pcs_parser, codeobj_test) +template +void +pcs_parser_codeobj_test() { Latch latch(NUM_THREADS); std::vector> threads{}; for(size_t i = 0; i < NUM_THREADS; i++) - threads.push_back(std::async(std::launch::async, multithread_codeobj, i, &latch)); -}; + threads.push_back( + std::async(std::launch::async, multithread_codeobj, i, &latch)); +} + +TEST(pcs_parser, codeobj_test) +{ + pcs_parser_codeobj_test(); + pcs_parser_codeobj_test(); +} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp index 
105f5c5583..2a942c4e24 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp @@ -28,184 +28,284 @@ #include "lib/rocprofiler-sdk/pc_sampling/parser/gfx11.hpp" #include "lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp" -#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.h" +#include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp" #include "lib/rocprofiler-sdk/pc_sampling/parser/rocr.h" +#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h" -template -inline rocprofiler_pc_sampling_record_t +// TODO: refactor the commented code for stochastic sampling + +// template +// inline rocprofiler_pc_sampling_record_t +// copyStochasticSample(const perf_sample_snapshot_v1& sample); + +// template <> +// inline rocprofiler_pc_sampling_record_t +// copyStochasticSample(const perf_sample_snapshot_v1& sample) +// { +// rocprofiler_pc_sampling_record_t ret = copySampleHeader(sample); +// ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 26) & 0x1; +// // Check wave_id matches snapshot_wave_id + +// ret.flags.has_wave_cnt = true; +// ret.flags.has_stall_reason = true; + +// ret.wave_count = sample.perf_snapshot_data1 & 0x3F; + +// ret.wave_issued = sample.perf_snapshot_data >> 1; +// ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2; +// ret.snapshot.inst_type = sample.perf_snapshot_data >> 3; +// ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7; +// ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 10) & 0xFF; +// ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 18) & 0xFF; +// ret.reserved = 0; +// return ret; +// } + +// template <> +// inline rocprofiler_pc_sampling_record_t +// copyStochasticSample(const perf_sample_snapshot_v1& sample) +// { +// rocprofiler_pc_sampling_record_t ret = copySampleHeader(sample); +// 
ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 23) & 0x1; +// // Check wave_id matches snapshot_wave_id + +// ret.flags.has_stall_reason = true; + +// ret.wave_issued = sample.perf_snapshot_data >> 1; +// ret.snapshot.inst_type = sample.perf_snapshot_data >> 2; +// ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 6) & 0x7; +// ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 9) & 0x7F; +// ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 16) & 0x7F; +// ret.snapshot.dual_issue_valu = false; +// ret.reserved = 0; +// return ret; +// } + +// #define BITSHIFT(sname) out |= ((in >> GFX::sname) & 1) << PCSAMPLE::sname + +// template +// inline int +// translate_arb(int in) +// { +// size_t out = 0; +// BITSHIFT(ISSUE_VALU); +// BITSHIFT(ISSUE_MATRIX); +// BITSHIFT(ISSUE_LDS); +// BITSHIFT(ISSUE_LDS_DIRECT); +// BITSHIFT(ISSUE_SCALAR); +// BITSHIFT(ISSUE_VMEM_TEX); +// BITSHIFT(ISSUE_FLAT); +// BITSHIFT(ISSUE_EXP); +// BITSHIFT(ISSUE_MISC); +// BITSHIFT(ISSUE_BRMSG); +// return out & 0x3FF; +// } + +// #undef BITSHIFT + +// #define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname + +// template +// class GFX_REASON_LUT : public std::array +// { +// public: +// GFX_REASON_LUT() +// { +// std::memset(data(), 0, size() * sizeof(int)); +// LUTOVERLOAD(REASON_NOT_AVAILABLE); +// LUTOVERLOAD(REASON_ALU); +// LUTOVERLOAD(REASON_WAITCNT); +// LUTOVERLOAD(REASON_INTERNAL); +// LUTOVERLOAD(REASON_BARRIER); +// LUTOVERLOAD(REASON_ARBITER); +// LUTOVERLOAD(REASON_EX_STALL); +// LUTOVERLOAD(REASON_OTHER_WAIT); +// LUTOVERLOAD(REASON_SLEEP); +// } +// }; + +// template +// class GFX_INST_LUT : public std::array +// { +// public: +// GFX_INST_LUT() +// { +// std::memset(data(), 0, size() * sizeof(int)); +// LUTOVERLOAD(TYPE_VALU); +// LUTOVERLOAD(TYPE_MATRIX); +// LUTOVERLOAD(TYPE_SCALAR); +// LUTOVERLOAD(TYPE_TEX); +// LUTOVERLOAD(TYPE_LDS); +// LUTOVERLOAD(TYPE_LDS_DIRECT); +// 
LUTOVERLOAD(TYPE_FLAT); +// LUTOVERLOAD(TYPE_EXP); +// LUTOVERLOAD(TYPE_MESSAGE); +// LUTOVERLOAD(TYPE_BARRIER); +// LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN); +// LUTOVERLOAD(TYPE_BRANCH_TAKEN); +// LUTOVERLOAD(TYPE_JUMP); +// LUTOVERLOAD(TYPE_OTHER); +// LUTOVERLOAD(TYPE_NO_INST); +// LUTOVERLOAD(TYPE_DUAL_VALU); +// } +// }; + +// template +// inline int +// translate_reason(int in) +// { +// static GFX_REASON_LUT lut; +// return lut[in & 0x1F]; +// } + +// template +// inline int +// translate_inst(int in) +// { +// static GFX_INST_LUT lut; +// return lut[in & 0x1F]; +// } + +// #undef LUTOVERLOAD + +// template +// inline rocprofiler_pc_sampling_record_t +// copySample(const void* sample) +// { +// if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample); + +// rocprofiler_pc_sampling_record_t ret = +// copyStochasticSample(*(const perf_sample_snapshot_v1*) sample); + +// ret.snapshot.inst_type = translate_inst(ret.snapshot.inst_type); +// ret.snapshot.arb_state_issue = translate_arb(ret.snapshot.arb_state_issue); +// ret.snapshot.arb_state_stall = translate_arb(ret.snapshot.arb_state_stall); +// ret.snapshot.reason_not_issued = translate_reason(ret.snapshot.reason_not_issued); + +// return ret; +// } + +#define EXTRACT_BITS(val, bit_end, bit_start) \ + (((val) >> (bit_start)) & ((1U << ((bit_end) - (bit_start) + 1)) - 1)) + +template +inline void +copyChipletId(PcSamplingRecordT& record, const SType& sample) +{ + // extract chiplet record + record.hw_id.chiplet = sample.chiplet_and_wave_id >> 8; +} + +template +inline void +copyHwId(HwIdT& hw_id, const uint32_t hsa_hw_id); + +template <> +inline void +copyHwId(rocprofiler_pc_sampling_hw_id_v0_t& hw_id, + const uint32_t hw_id_reg) +{ + // 3:0 -> wave_id + hw_id.wave_id = EXTRACT_BITS(hw_id_reg, 3, 0); + // 5:4 -> simd_id + hw_id.simd_id = EXTRACT_BITS(hw_id_reg, 5, 4); + // 7:6 -> pipe_id; + hw_id.pipe_id = EXTRACT_BITS(hw_id_reg, 7, 6); + // 11:8 -> cu_id + hw_id.cu_or_wgp_id = EXTRACT_BITS(hw_id_reg,
11, 8); + // 12 -> sa_id + hw_id.shader_array_id = EXTRACT_BITS(hw_id_reg, 12, 12); + // 15:13 -> se_id + hw_id.shader_engine_id = EXTRACT_BITS(hw_id_reg, 15, 13); + // 19:16 -> tg_id + hw_id.workgroup_id = EXTRACT_BITS(hw_id_reg, 19, 16); + // 23:20 -> vm_id + hw_id.vm_id = EXTRACT_BITS(hw_id_reg, 23, 20); + // 26:24 -> queue_id + hw_id.queue_id = EXTRACT_BITS(hw_id_reg, 26, 24); + // 29:27 -> state_id (ignored) + // 31:30 -> me_id + hw_id.microengine_id = EXTRACT_BITS(hw_id_reg, 31, 30); +} + +#undef EXTRACT_BITS + +template +inline PcSamplingRecordT copySampleHeader(const SType& sample) { - rocprofiler_pc_sampling_record_t ret; - ret.flags = pcsample_header_v1_t{.raw = 0}.flags; - ret.flags.type = AMD_SNAPSHOT_V1; + PcSamplingRecordT ret; + // zero out all record fields + std::memset(&ret, 0, sizeof(PcSamplingRecordT)); + + // Decode fields common for all host-trap and stochastic on all architectures. + ret.size = sizeof(PcSamplingRecordT); + ret.wave_in_group = sample.chiplet_and_wave_id & 0x3F; ret.exec_mask = sample.exec_mask; ret.workgroup_id.x = sample.workgroup_id_x; ret.workgroup_id.y = sample.workgroup_id_y; ret.workgroup_id.z = sample.workgroup_id_z; - ret.chiplet = sample.chiplet_and_wave_id >> 8; - ret.wave_id = sample.chiplet_and_wave_id & 0x3F; - ret.hw_id = sample.hw_id; ret.timestamp = sample.timestamp; + return ret; } -inline rocprofiler_pc_sampling_record_t -copyHostTrapSample(const perf_sample_host_trap_v1& sample) -{ - rocprofiler_pc_sampling_record_t ret = copySampleHeader(sample); - ret.flags.type = AMD_HOST_TRAP_V1; - return ret; -} - -template -inline rocprofiler_pc_sampling_record_t -copyStochasticSample(const perf_sample_snapshot_v1& sample); +template +inline PcSamplingRecordT +copySample(const void* sample); +/** + * @brief Host trap V0 sample for GFX9 + */ template <> -inline rocprofiler_pc_sampling_record_t -copyStochasticSample(const perf_sample_snapshot_v1& sample) +inline rocprofiler_pc_sampling_record_host_trap_v0_t 
+copySample(const void* sample) { - rocprofiler_pc_sampling_record_t ret = copySampleHeader(sample); - ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 26) & 0x1; - // Check wave_id matches snapshot_wave_id - - ret.flags.has_wave_cnt = true; - ret.flags.has_stall_reason = true; - - ret.wave_count = sample.perf_snapshot_data1 & 0x3F; - - ret.wave_issued = sample.perf_snapshot_data >> 1; - ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2; - ret.snapshot.inst_type = sample.perf_snapshot_data >> 3; - ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7; - ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 10) & 0xFF; - ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 18) & 0xFF; - ret.reserved = 0; + const auto& sample_ = *static_cast(sample); + auto ret = copySampleHeader(sample_); + copyChipletId(ret, sample_); + copyHwId(ret.hw_id, sample_.hw_id); + // copyHwId(&ret, sample); return ret; } template <> -inline rocprofiler_pc_sampling_record_t -copyStochasticSample(const perf_sample_snapshot_v1& sample) +inline rocprofiler_pc_sampling_record_stochastic_v0_t +copySample(const void* sample) { - rocprofiler_pc_sampling_record_t ret = copySampleHeader(sample); - ret.flags.valid = sample.perf_snapshot_data & (~sample.perf_snapshot_data >> 23) & 0x1; - // Check wave_id matches snapshot_wave_id - - ret.flags.has_stall_reason = true; - - ret.wave_issued = sample.perf_snapshot_data >> 1; - ret.snapshot.inst_type = sample.perf_snapshot_data >> 2; - ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 6) & 0x7; - ret.snapshot.arb_state_issue = (sample.perf_snapshot_data >> 9) & 0x7F; - ret.snapshot.arb_state_stall = (sample.perf_snapshot_data >> 16) & 0x7F; - ret.snapshot.dual_issue_valu = false; - ret.reserved = 0; + const auto& sample_ = *static_cast(sample); + auto ret = copySampleHeader(sample_); + copyChipletId(ret, sample_); + copyHwId(ret.hw_id, sample_.hw_id); + ret.wave_count = 
sample_.perf_snapshot_data1 & 0x3F; + // TODO: implement logic for manipulating stochastic related fields return ret; } -#define BITSHIFT(sname) out |= ((in >> GFX::sname) & 1) << PCSAMPLE::sname - -template -inline int -translate_arb(int in) +/** + * @brief Host trap V0 sample for GFX11 + */ +template <> +inline rocprofiler_pc_sampling_record_host_trap_v0_t +copySample(const void* sample) { - size_t out = 0; - BITSHIFT(ISSUE_VALU); - BITSHIFT(ISSUE_MATRIX); - BITSHIFT(ISSUE_LDS); - BITSHIFT(ISSUE_LDS_DIRECT); - BITSHIFT(ISSUE_SCALAR); - BITSHIFT(ISSUE_VMEM_TEX); - BITSHIFT(ISSUE_FLAT); - BITSHIFT(ISSUE_EXP); - BITSHIFT(ISSUE_MISC); - BITSHIFT(ISSUE_BRMSG); - return out & 0x3FF; -} - -#undef BITSHIFT - -#define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname - -template -class GFX_REASON_LUT : public std::array -{ -public: - GFX_REASON_LUT() - { - std::memset(data(), 0, size() * sizeof(int)); - LUTOVERLOAD(REASON_NOT_AVAILABLE); - LUTOVERLOAD(REASON_ALU); - LUTOVERLOAD(REASON_WAITCNT); - LUTOVERLOAD(REASON_INTERNAL); - LUTOVERLOAD(REASON_BARRIER); - LUTOVERLOAD(REASON_ARBITER); - LUTOVERLOAD(REASON_EX_STALL); - LUTOVERLOAD(REASON_OTHER_WAIT); - LUTOVERLOAD(REASON_SLEEP); - } -}; - -template -class GFX_INST_LUT : public std::array -{ -public: - GFX_INST_LUT() - { - std::memset(data(), 0, size() * sizeof(int)); - LUTOVERLOAD(TYPE_VALU); - LUTOVERLOAD(TYPE_MATRIX); - LUTOVERLOAD(TYPE_SCALAR); - LUTOVERLOAD(TYPE_TEX); - LUTOVERLOAD(TYPE_LDS); - LUTOVERLOAD(TYPE_LDS_DIRECT); - LUTOVERLOAD(TYPE_FLAT); - LUTOVERLOAD(TYPE_EXP); - LUTOVERLOAD(TYPE_MESSAGE); - LUTOVERLOAD(TYPE_BARRIER); - LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN); - LUTOVERLOAD(TYPE_BRANCH_TAKEN); - LUTOVERLOAD(TYPE_JUMP); - LUTOVERLOAD(TYPE_OTHER); - LUTOVERLOAD(TYPE_NO_INST); - LUTOVERLOAD(TYPE_DUAL_VALU); - } -}; - -template -inline int -translate_reason(int in) -{ - static GFX_REASON_LUT lut; - return lut[in & 0x1F]; -} - -template -inline int -translate_inst(int in) -{ - static GFX_INST_LUT 
lut; - return lut[in & 0x1F]; -} - -#undef LUTOVERLOAD - -template -inline rocprofiler_pc_sampling_record_t -copySample(const void* sample) -{ - if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample); - - rocprofiler_pc_sampling_record_t ret = - copyStochasticSample(*(const perf_sample_snapshot_v1*) sample); - - ret.snapshot.inst_type = translate_inst(ret.snapshot.inst_type); - ret.snapshot.arb_state_issue = translate_arb(ret.snapshot.arb_state_issue); - ret.snapshot.arb_state_stall = translate_arb(ret.snapshot.arb_state_stall); - ret.snapshot.reason_not_issued = translate_reason(ret.snapshot.reason_not_issued); - + const auto& sample_ = *static_cast(sample); + auto ret = copySampleHeader(sample_); + // TODO: decode other fields. + return ret; +} + +// TODO: implement stochastic for GFX11 +template <> +inline rocprofiler_pc_sampling_record_stochastic_v0_t +copySample(const void* sample) +{ + const auto& sample_ = *static_cast(sample); + auto ret = copySampleHeader(sample_); + // TODO: decode other fields + // TODO: implement logic for manipulating stochastic related fields + // ret.wave_count = sample_.perf_snapshot_data1 & 0x3F; return ret; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp index a3da98c6ec..8d35de5dc7 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp @@ -202,7 +202,8 @@ test_fail_because_of_wrong_agent(const callback_data* pcs_config->method, pcs_config->unit, pcs_config->min_interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND); } @@ -218,7 +219,8 @@ test_fail_because_of_wrong_context(const callback_data* pcs_config->method, pcs_config->unit, 
pcs_config->min_interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND); } @@ -234,7 +236,8 @@ test_fail_because_of_wrong_buffer(const callback_data* pcs_config->method, pcs_config->unit, pcs_config->min_interval, - not_existing_buffer_id), + not_existing_buffer_id, + 0), ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND); } @@ -254,7 +257,8 @@ test_fail_because_of_unsupported_configuration( pcs_config->method, pcs_config->unit, less_than_min_interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_SUCCESS); EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx, @@ -262,7 +266,8 @@ test_fail_because_of_unsupported_configuration( pcs_config->method, pcs_config->unit, greater_than_max_interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_SUCCESS); EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx, @@ -270,7 +275,8 @@ test_fail_because_of_unsupported_configuration( wrong_method, pcs_config->unit, pcs_config->max_interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_SUCCESS); EXPECT_NE(rocprofiler_configure_pc_sampling_service(cb_data->client_ctx, @@ -278,7 +284,8 @@ test_fail_because_of_unsupported_configuration( pcs_config->method, wrong_unit, pcs_config->max_interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_SUCCESS); } @@ -293,7 +300,8 @@ test_fail_because_service_is_already_configured( pcs_config->method, pcs_config->unit, pcs_config->min_interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED); } @@ -374,7 +382,8 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service) pcs_config.method, pcs_config.unit, interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), "Failed to configure PC sampling service"); test_fail_because_service_is_already_configured(cb_data, 
agent_id, &pcs_config); @@ -385,7 +394,8 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service) pcs_config.method, pcs_config.unit, interval, - another_buff), + another_buff, + 0), ROCPROFILER_STATUS_ERROR); } @@ -452,6 +462,7 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service) ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP, ROCPROFILER_PC_SAMPLING_UNIT_TIME, 1, - cb_data.client_buffer), + cb_data.client_buffer, + 0), ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED); } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/pc_sampling_vs_counter_collection.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/pc_sampling_vs_counter_collection.cpp index e05e90e803..42c6541ae0 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/pc_sampling_vs_counter_collection.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/pc_sampling_vs_counter_collection.cpp @@ -292,7 +292,8 @@ pc_sampling_vs_counter_collection(cc_setup_fn_t cc_setup_fn) pcs_config.method, pcs_config.unit, interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT); } @@ -393,7 +394,8 @@ counter_collection_vs_pc_sampling(cc_setup_fn_t cc_setup_fn) pcs_config.method, pcs_config.unit, interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), ROCPROFILER_STATUS_SUCCESS); } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/query_configuration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/query_configuration.cpp index ad73a21b6d..4ac8058eee 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/query_configuration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/query_configuration.cpp @@ -285,7 +285,8 @@ TEST(pc_sampling, query_configs_after_service_setup) pcs_config.method, pcs_config.unit, interval, - 
cb_data->client_buffer), + cb_data->client_buffer, + 0), "Failed to configure PC sampling service"); // query configuration and expect to see `pcs_config->max_interval` as the `interval` diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/samples_processing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/samples_processing.cpp index 501246cbdb..ea1ccc634f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/samples_processing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/samples_processing.cpp @@ -251,7 +251,8 @@ TEST(pc_sampling, processing_pc_samples) pcs_config.method, pcs_config.unit, interval, - cb_data->client_buffer), + cb_data->client_buffer, + 0), "Failed to configure PC sampling service"); ROCPROFILER_CALL(rocprofiler_create_callback_thread(&cb_data->client_thread), diff --git a/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp b/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp index 3b9718aba0..e660744e6c 100644 --- a/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp +++ b/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp @@ -266,7 +266,8 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, picked_cfg->method, picked_cfg->unit, interval, - buffer_id); + buffer_id, + 0); if(status == ROCPROFILER_STATUS_SUCCESS) { *utils::get_output_stream() @@ -334,24 +335,25 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, } else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING) { - if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_SAMPLE) + if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE) { - auto* pc_sample = - static_cast(cur_header->payload); + auto* pc_sample = static_cast( + cur_header->payload); - ss << "(code_obj_id, offset): (" << pc_sample->pc.loaded_code_object_id - << ", 0x" << std::hex << pc_sample->pc.loaded_code_object_offset << "), " + 
ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x" + << std::hex << pc_sample->pc.code_object_offset << "), " << "timestamp: " << std::dec << pc_sample->timestamp << ", " << "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", " << "workgroup_id_(x=" << std::dec << std::setw(5) << pc_sample->workgroup_id.x << ", " << "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", " << "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), " - << "wave_id: " << std::setw(2) - << static_cast(pc_sample->wave_id) << ", " + << "wave_in_group: " << std::setw(2) + << static_cast(pc_sample->wave_in_group) << ", " << "chiplet: " << std::setw(2) - << static_cast(pc_sample->chiplet) << ", " - << "cu_id: " << pc_sample->hw_id << ", " + << static_cast(pc_sample->hw_id.chiplet) + << ", " + // << "cu_id: " << pc_sample->hw_id << ", " << "correlation: {internal=" << std::setw(7) << pc_sample->correlation_id.internal << ", " << "external=" << std::setw(5) << pc_sample->correlation_id.external.value @@ -373,8 +375,8 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, assert(corr_id.external.value > 0); // Decoding the PC - auto inst = translator.get(pc_sample->pc.loaded_code_object_id, - pc_sample->pc.loaded_code_object_offset); + auto inst = translator.get(pc_sample->pc.code_object_id, + pc_sample->pc.code_object_offset); flat_profile.add_sample(std::move(inst), pc_sample->exec_mask); } else