From 0ca07105a301ad6a176a3f0e4935b82f9bd3e0ae Mon Sep 17 00:00:00 2001 From: "Indic, Vladimir" Date: Fri, 21 Mar 2025 20:40:45 +0100 Subject: [PATCH] [SDK][rocprofv3] MI300 Stochastic PC sampling (#92) * MI300 Stochastic PC sampling SDK API implementation * ROCProfV3: Stochastic PC sampling Support (#94) * ROCProfV3: MI300 Stochastic PC sampling initial draft * ROCProfV3: Initial Stochastic PC sampling Tests (#95) ROCProfV3: Initial Stochastic PC sampling tests * Update rocprofiler_pc_sampling_record_stochastic_v0_t - update doxygen docs for members - replace rocprofiler_correlation_id_t with rocprofiler_async_correlation_id_t * Relax the check in JSON tests * drain PC sampling buffer during finalize_rocprofv3 * Increase timeout for "Test Install Build" step - 10 minutes -> 20 minutes - "Test Installed Packages" has 20 minutes so "Test Install Build" should also --------- Co-authored-by: Jonathan R. Madsen [ROCm/rocprofiler-sdk commit: 49ce79a5b51a016b1c86eb67011e44570ce75bd7] --- .../workflows/continuous_integration.yml | 6 +- .../samples/pc_sampling/pcs.cpp | 160 ++++-- .../samples/pc_sampling/utils.cpp | 14 + .../samples/pc_sampling/utils.hpp | 5 +- .../rocprofiler-sdk/cxx/serialization.hpp | 88 +++ .../source/include/rocprofiler-sdk/fwd.h | 3 +- .../include/rocprofiler-sdk/pc_sampling.h | 265 ++++++++- .../source/lib/output/buffered_output.hpp | 3 + .../rocprofiler-sdk/source/lib/output/csv.hpp | 1 + .../source/lib/output/domain_type.cpp | 4 + .../source/lib/output/domain_type.hpp | 1 + .../source/lib/output/generateCSV.cpp | 73 +++ .../source/lib/output/generateCSV.hpp | 6 + .../source/lib/output/generateJSON.cpp | 9 +- .../source/lib/output/generateJSON.hpp | 6 +- .../source/lib/output/generateStats.cpp | 16 + .../source/lib/output/generateStats.hpp | 6 + .../source/lib/output/metadata.hpp | 4 + .../source/lib/output/pc_sample_transform.hpp | 28 + .../lib/rocprofiler-sdk-tool/config.cpp | 22 +- .../lib/rocprofiler-sdk-tool/config.hpp | 1 + 
.../source/lib/rocprofiler-sdk-tool/tool.cpp | 131 +++-- .../lib/rocprofiler-sdk/pc_sampling.cpp | 99 ++++ .../pc_sampling/ioctl/ioctl_adapter.cpp | 275 ++++++--- .../pc_sampling/ioctl/ioctl_adapter_types.hpp | 26 +- .../pc_sampling/parser/CMakeLists.txt | 2 +- .../pc_sampling/parser/correlation.hpp | 2 + .../pc_sampling/parser/gfx11.hpp | 16 +- .../pc_sampling/parser/gfx9.hpp | 33 +- .../parser/pc_record_interface.cpp | 42 +- .../parser/pc_record_interface.hpp | 2 +- .../pc_sampling/parser/tests/gfx9test.cpp | 543 ++++++++++-------- .../pc_sampling/parser/tests/mocks.hpp | 2 + .../pc_sampling/parser/tests/multigpu.cpp | 1 + .../pc_sampling/parser/translation.hpp | 208 ++++--- .../rocprofiler-sdk/pc_sampling/service.cpp | 41 +- .../rocprofiler-sdk/pc_sampling/service.hpp | 6 + .../pc_sampling/tests/configure_service.cpp | 80 ++- .../lib/rocprofiler-sdk/registration.cpp | 3 + .../tests/pc_sampling/address_translation.cpp | 24 +- .../tests/pc_sampling/address_translation.hpp | 53 +- .../rocprofiler-sdk/tests/pc_sampling/pcs.cpp | 223 ++++--- .../rocprofiler-sdk/tests/pc_sampling/pcs.hpp | 3 - .../tests/pc_sampling/utils.cpp | 14 + .../tests/pc_sampling/utils.hpp | 5 +- .../tests/pytest-packages/CMakeLists.txt | 2 + .../pc_sampling/CMakeLists.txt | 18 + .../exec_mask_manipulation/CMakeLists.txt | 14 + .../exec_mask_manipulation/__init__.py | 23 + .../pc_sampling/exec_mask_manipulation/csv.py | 210 +++++++ .../exec_mask_manipulation/json.py | 244 ++++++++ .../pc_sampling/stochastic/CMakeLists.txt | 17 + .../pc_sampling/stochastic/__init__.py | 24 + .../pc_sampling/stochastic/csv/CMakeLists.txt | 16 + .../pc_sampling/stochastic/csv/__init__.py | 24 + .../stochastic/csv/gfx9/CMakeLists.txt | 18 + .../stochastic/csv/gfx9/__init__.py | 110 ++++ .../stochastic/csv/gfx9/flat_instructions.py | 74 +++ .../stochastic/csv/gfx9/lds_instructions.py | 67 +++ .../csv/gfx9/matrix_instructions.py | 107 ++++ .../csv/gfx9/s_instructions/CMakeLists.txt | 23 + 
.../csv/gfx9/s_instructions/__init__.py | 151 +++++ .../s_instructions/barrier_instructions.py | 65 +++ .../s_instructions/branch_instructions.py | 142 +++++ .../s_instructions/internal_instructions.py | 38 ++ .../gfx9/s_instructions/jump_instructions.py | 60 ++ .../s_instructions/message_instructions.py | 64 +++ .../gfx9/s_instructions/other_instructions.py | 64 +++ .../s_instructions/scalar_instructions.py | 70 +++ .../csv/gfx9/s_instructions/waitcnt.py | 45 ++ .../csv/gfx9/texture_instructions.py | 74 +++ .../stochastic/csv/gfx9/valu_instructions.py | 69 +++ .../stochastic/json/CMakeLists.txt | 16 + .../pc_sampling/stochastic/json/__init__.py | 24 + .../stochastic/json/gfx9/CMakeLists.txt | 17 + .../stochastic/json/gfx9/__init__.py | 176 ++++++ .../stochastic/json/gfx9/arbiter_state.py | 104 ++++ .../json/gfx9/other_instructions.py | 160 ++++++ .../stochastic/json/gfx9/s_instructions.py | 222 +++++++ .../transpose_multiple_agents/CMakeLists.txt | 14 + .../transpose_multiple_agents/__init__.py | 23 + .../transpose_multiple_agents/csv.py | 93 +++ .../rocprofv3/pc-sampling/CMakeLists.txt | 1 + .../exec-mask-manipulation/validate.py | 357 +----------- .../transpose-multiple-agents/validate.py | 54 +- .../pc-sampling/stochastic/CMakeLists.txt | 6 + .../exec-mask-manipulation/CMakeLists.txt | 153 +++++ .../exec-mask-manipulation/conftest.py | 70 +++ .../exec-mask-manipulation/input.json | 14 + .../exec-mask-manipulation/input.yml | 8 + .../exec-mask-manipulation/pytest.ini | 5 + .../exec-mask-manipulation/validate.py | 62 ++ .../transpose-multiple-agents/CMakeLists.txt | 171 ++++++ .../transpose-multiple-agents/conftest.py | 84 +++ .../transpose-multiple-agents/input.json | 15 + .../transpose-multiple-agents/input.yml | 9 + .../transpose-multiple-agents/pytest.ini | 5 + .../transpose-multiple-agents/validate.py | 46 ++ 98 files changed, 5266 insertions(+), 1031 deletions(-) create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/CMakeLists.txt 
create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/csv.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/json.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/flat_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/lds_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/matrix_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/barrier_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/branch_instructions.py create mode 100644 
projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/internal_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/jump_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/message_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/other_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/scalar_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/waitcnt.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/texture_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/valu_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/arbiter_state.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/other_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/s_instructions.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/transpose_multiple_agents/CMakeLists.txt create mode 100644 
projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/transpose_multiple_agents/__init__.py create mode 100644 projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/transpose_multiple_agents/csv.py create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/conftest.py create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.json create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.yml create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/pytest.ini create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/validate.py create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/conftest.py create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.json create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.yml create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/pytest.ini create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/validate.py diff --git a/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml b/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml index a475ec9be1..354fa8e2cb 100644 --- 
a/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml +++ b/projects/rocprofiler-sdk/.github/workflows/continuous_integration.yml @@ -32,7 +32,7 @@ env: navi4_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$" navi3_EXCLUDE_LABEL_REGEX: "^(pc-sampling|pc_sampling|openmp-target)$" vega20_EXCLUDE_LABEL_REGEX: "^(pc-sampling|pc_sampling|openmp-target)$" - mi200_EXCLUDE_LABEL_REGEX: "" + mi200_EXCLUDE_LABEL_REGEX: "^(stochastic)$" mi300_EXCLUDE_LABEL_REGEX: "^(pc-sampling|pc_sampling)$" mi300a_EXCLUDE_LABEL_REGEX: "" mi325_EXCLUDE_LABEL_REGEX: "^(pc-sampling|pc_sampling)$" @@ -124,7 +124,7 @@ jobs: - name: Test Install Build if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} - timeout-minutes: 10 + timeout-minutes: 20 shell: bash run: | CMAKE_PREFIX_PATH=/opt/rocprofiler-sdk cmake -B build-samples samples @@ -249,7 +249,7 @@ jobs: - name: Test Install Build if: ${{ contains(matrix.runner, env.CORE_EXT_RUNNER) }} - timeout-minutes: 10 + timeout-minutes: 20 shell: bash run: | CMAKE_PREFIX_PATH=/opt/rocprofiler-sdk cmake -B build-samples samples diff --git a/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp b/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp index d782544b75..31fed399f5 100644 --- a/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp +++ b/projects/rocprofiler-sdk/samples/pc_sampling/pcs.cpp @@ -43,7 +43,7 @@ namespace pcs // TODO: Since this is used only within the `tool_init`, // we are safe using static constructor. // It would be nice to make this consistent with the `buffer_ids`. -tool_agent_info_vec_t gpu_agents; +tool_agent_info_vec_t gpu_agents = {}; // The reason for using raw pointers is the following. // Sometimes, statically created objects of the client::pcs // namespace might be freed prior to the `tool_fini`, @@ -55,6 +55,12 @@ tool_agent_info_vec_t gpu_agents; // `pcs` namespace and export functions for registering/flushing/destroying buffers. 
pc_sampling_buffer_id_vec_t* buffer_ids = nullptr; +namespace +{ +constexpr uint64_t host_trap_interval = 10000; // 10ms +constexpr uint64_t stochastic_interval = 1048576; // 2 ^ 20 cycles +} // namespace + void init() { @@ -67,6 +73,7 @@ fini() // Clear the data buffer_ids->clear(); delete buffer_ids; + buffer_ids = nullptr; } pc_sampling_buffer_id_vec_t* @@ -112,7 +119,7 @@ find_all_gpu_agents_supporting_pc_sampling_impl(rocprofiler_agent_version_t vers << "type=" << _agents[i]->type << "\n"; } - *utils::get_output_stream() << ss.str() << std::endl; + *utils::get_output_stream() << ss.str() << "\n"; return ROCPROFILER_STATUS_SUCCESS; } @@ -161,8 +168,8 @@ query_avail_configs_for_agent(tool_agent_info* agent_info) // The query operation failed, so consider the PC sampling is unsupported at the agent. // This can happen if the PC sampling service is invoked within the ROCgdb. ss << "Querying PC sampling capabilities failed with status=" << status - << " :: " << rocprofiler_get_status_string(status) << std::endl; - *utils::get_output_stream() << ss.str() << std::endl; + << " :: " << rocprofiler_get_status_string(status) << "\n"; + *utils::get_output_stream() << ss.str() << "\n"; return false; } else if(agent_info->avail_configs->empty()) @@ -172,7 +179,8 @@ query_avail_configs_for_agent(tool_agent_info* agent_info) } ss << "The agent with the id: " << agent_info->agent_id.handle << " supports the " - << agent_info->avail_configs->size() << " configurations: " << std::endl; + << agent_info->avail_configs->size() << " configurations: " + << "\n"; size_t ind = 0; for(auto& cfg : *agent_info->avail_configs) { @@ -181,7 +189,11 @@ query_avail_configs_for_agent(tool_agent_info* agent_info) << "unit: " << cfg.unit << ", " << "min_interval: " << cfg.min_interval << ", " << "max_interval: " << cfg.max_interval << ", " - << "flags: " << std::hex << cfg.flags << std::dec << std::endl; + << "flags: " << std::hex << cfg.flags << std::dec + << ((cfg.flags == 
ROCPROFILER_PC_SAMPLING_CONFIGURATION_FLAGS_INTERVAL_POW2) + ? " (an interval value must be power of 2)" + : "") + << "\n"; } *utils::get_output_stream() << ss.str() << std::flush; @@ -194,8 +206,9 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, rocprofiler_context_id_t context_id, rocprofiler_buffer_id_t buffer_id) { - int failures = 10; - size_t interval = 0; + auto stochastic_picked = false; + int failures = 10; + size_t interval = 0; do { // Update the list of available configurations @@ -216,9 +229,9 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, { if(cfg.method == ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC) { - // Temporarily disable stochastic sampling as it's not fully supported. - // first_stochastic_config = &cfg; - // break; + first_stochastic_config = &cfg; + stochastic_picked = true; + break; } else if(!first_host_trap_config && cfg.method == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) @@ -238,7 +251,7 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, } else { - interval = 10000; + interval = stochastic_picked ? stochastic_interval : host_trap_interval; } auto status = rocprofiler_configure_pc_sampling_service(context_id, @@ -251,8 +264,10 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, if(status == ROCPROFILER_STATUS_SUCCESS) { *utils::get_output_stream() - << ">>> We chose PC sampling interval: " << interval - << " on the agent: " << agent_info->agent->id.handle << std::endl; + << ">>> We chose " << (stochastic_picked ? "stochastic" : "Host-Trap") + << " PC sampling with the interval: " << interval << " " + << (stochastic_picked ? 
"clock-cycles" : "micro seconds") + << " on the agent: " << agent_info->agent->id.handle << "\n"; return; } else if(status != ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE) @@ -279,6 +294,87 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, ROCPROFILER_CHECK(ROCPROFILER_STATUS_ERROR); } +template +void +print_sample_common_fields(std::ostream& os, const PcSamplingRecordT* pc_sample) +{ + os << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x" << std::hex + << pc_sample->pc.code_object_offset << "), " + << "timestamp: " << std::dec << pc_sample->timestamp << ", " + << "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", " + << "workgroup_id_(x=" << std::dec << std::setw(5) << pc_sample->workgroup_id.x << ", " + << "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", " + << "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), " + << "wave_in_group: " << std::setw(2) << static_cast(pc_sample->wave_in_group) + << ", " + << "chiplet: " << std::setw(2) << static_cast(pc_sample->hw_id.chiplet) << ", " + << "dispatch_id: " << std::setw(7) << pc_sample->dispatch_id << "," + << "correlation: {internal=" << std::setw(7) << pc_sample->correlation_id.internal << ", " + << "external=" << std::setw(5) << pc_sample->correlation_id.external.value << "}, "; +} + +void +print_sample(std::ostream& os, const rocprofiler_pc_sampling_record_host_trap_v0_t* sample) +{ + print_sample_common_fields(os, sample); + os << "\n"; +} + +void +print_sample(std::ostream& os, const rocprofiler_pc_sampling_record_stochastic_v0_t* sample) +{ + print_sample_common_fields(os, sample); + + if(sample->wave_issued) + { + auto* inst_c_str = rocprofiler_get_pc_sampling_instruction_type_name( + static_cast(sample->inst_type)); + utils::pcs_assert(inst_c_str != nullptr, "Invalid instruction type"); + os << "wave issued " << std::string(inst_c_str) << " instruction, "; + } + else + { + auto* reason_c_str = 
rocprofiler_get_pc_sampling_instruction_not_issued_reason_name( + static_cast( + sample->snapshot.reason_not_issued)); + utils::pcs_assert(reason_c_str != nullptr, "Invalid not issued reason"); + os << "wave is stalled due to: " << std::string(reason_c_str) << " reason, "; + } + + auto snapshot = sample->snapshot; + os << "two VALU instructions issued: " << static_cast(snapshot.dual_issue_valu) + << ", "; + + os << "arbiter state: {pipe issued: (" + << "VALU: " << static_cast(snapshot.arb_state_issue_valu) << ", " + << "MATRIX: " << static_cast(snapshot.arb_state_issue_matrix) << ", " + << "LDS: " << static_cast(snapshot.arb_state_issue_lds) << ", " + << "LDS_DIRECT: " << static_cast(snapshot.arb_state_issue_lds_direct) << ", " + << "SCALAR: " << static_cast(snapshot.arb_state_issue_scalar) << ", " + << "TEX: " << static_cast(snapshot.arb_state_issue_vmem_tex) << ", " + << "FLAT: " << static_cast(snapshot.arb_state_issue_flat) << ", " + << "EXPORT: " << static_cast(snapshot.arb_state_issue_exp) << ", " + << "MISC: " << static_cast(snapshot.arb_state_issue_misc) << "), " + << "pipe stalled: (" + << "VALU: " << static_cast(snapshot.arb_state_stall_valu) << ", " + << "MATRIX: " << static_cast(snapshot.arb_state_stall_matrix) << ", " + << "LDS: " << static_cast(snapshot.arb_state_stall_lds) << ", " + << "LDS_DIRECT: " << static_cast(snapshot.arb_state_stall_lds_direct) << ", " + << "SCALAR: " << static_cast(snapshot.arb_state_stall_scalar) << ", " + << "TEX: " << static_cast(snapshot.arb_state_stall_vmem_tex) << ", " + << "FLAT: " << static_cast(snapshot.arb_state_stall_flat) << ", " + << "EXPORT: " << static_cast(snapshot.arb_state_stall_exp) << ", " + << "MISC: " << static_cast(snapshot.arb_state_stall_misc) << ")}"; + + os << "\n"; +} + +void +print_sample(std::ostream& os, const rocprofiler_pc_sampling_record_invalid_t* /*sample*/) +{ + os << "Invalid sample detected.\n"; +} + void rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, 
rocprofiler_buffer_id_t /*buffer_id*/, @@ -289,7 +385,7 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, { std::stringstream ss; ss << "The number of delivered samples is: " << num_headers << ", " - << "while the number of dropped samples is: " << drop_count << std::endl; + << "while the number of dropped samples is: " << drop_count << "\n"; for(size_t i = 0; i < num_headers; i++) { @@ -312,23 +408,21 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, auto* pc_sample = static_cast( cur_header->payload); - ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x" - << std::hex << pc_sample->pc.code_object_offset << "), " - << "timestamp: " << std::dec << pc_sample->timestamp << ", " - << "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", " - << "workgroup_id_(x=" << std::dec << std::setw(5) << pc_sample->workgroup_id.x - << ", " - << "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", " - << "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), " - << "wave_in_group: " << std::setw(2) - << static_cast(pc_sample->wave_in_group) << ", " - << "chiplet: " << std::setw(2) - << static_cast(pc_sample->hw_id.chiplet) << ", " - << "dispatch_id: " << std::setw(7) << pc_sample->dispatch_id << "," - << "correlation: {internal=" << std::setw(7) - << pc_sample->correlation_id.internal << ", " - << "external=" << std::setw(5) << pc_sample->correlation_id.external.value << "}" - << std::endl; + print_sample(ss, pc_sample); + } + else if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE) + { + auto* pc_sample = static_cast( + cur_header->payload); + + print_sample(ss, pc_sample); + } + else if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_INVALID_SAMPLE) + { + auto* pc_sample = + static_cast(cur_header->payload); + + print_sample(ss, pc_sample); } else { @@ -341,7 +435,7 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, } } - 
*utils::get_output_stream() << ss.str() << std::endl; + *utils::get_output_stream() << ss.str() << "\n"; } } // namespace pcs } // namespace client diff --git a/projects/rocprofiler-sdk/samples/pc_sampling/utils.cpp b/projects/rocprofiler-sdk/samples/pc_sampling/utils.cpp index dfe66925fa..3d2b79f37d 100644 --- a/projects/rocprofiler-sdk/samples/pc_sampling/utils.cpp +++ b/projects/rocprofiler-sdk/samples/pc_sampling/utils.cpp @@ -33,5 +33,19 @@ get_output_stream() static std::ostream* _v = nullptr; return _v; } + +/** + * @brief Shows @p error_msg and aborts if @p condition is false. + * + */ +void +pcs_assert(bool condition, std::string_view error_msg) +{ + if(!condition) + { + std::cerr << "PC Sampling Assertion Error: " << error_msg << "\n"; + abort(); + } +} } // namespace utils } // namespace client diff --git a/projects/rocprofiler-sdk/samples/pc_sampling/utils.hpp b/projects/rocprofiler-sdk/samples/pc_sampling/utils.hpp index 4d2f28eab3..cd606cff0e 100644 --- a/projects/rocprofiler-sdk/samples/pc_sampling/utils.hpp +++ b/projects/rocprofiler-sdk/samples/pc_sampling/utils.hpp @@ -32,5 +32,8 @@ namespace utils { std::ostream*& get_output_stream(); -} + +void +pcs_assert(bool condition, std::string_view error_msg); +} // namespace utils } // namespace client diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp index 4ab8ff6a08..9a8723746b 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -930,6 +931,93 @@ save(ArchiveT& ar, rocprofiler_pc_sampling_record_host_trap_v0_t data) ROCP_SDK_SAVE_DATA_BITFIELD("wave_in_grp", wave_in_group); } +template +void +save(ArchiveT& ar, rocprofiler_pc_sampling_record_stochastic_header_t data) +{ + 
ROCP_SDK_SAVE_DATA_BITFIELD("has_mem_cnt", has_memory_counter); +} + +template +void +save_pc_sampling_inst_type(ArchiveT& ar, rocprofiler_pc_sampling_instruction_type_t inst_type) +{ + ar(make_nvp("inst_type", + std::string(rocprofiler_get_pc_sampling_instruction_type_name(inst_type)))); +} + +template +void +save_pc_sampling_stall_reason(ArchiveT& ar, + rocprofiler_pc_sampling_instruction_not_issued_reason_t stall_reason) +{ + ar(make_nvp( + "stall_reason", + std::string(rocprofiler_get_pc_sampling_instruction_not_issued_reason_name(stall_reason)))); +} + +template +void +save(ArchiveT& ar, rocprofiler_pc_sampling_snapshot_v0_t data) +{ + save_pc_sampling_stall_reason( + ar, + static_cast( + data.reason_not_issued)); + + ROCP_SDK_SAVE_DATA_BITFIELD("dual_issue_valu", dual_issue_valu); + + // Arb state (pipe issued) + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_valu", arb_state_issue_valu); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_matrix", arb_state_issue_matrix); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_lds", arb_state_issue_lds); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_lds_direct", arb_state_issue_lds_direct); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_scalar", arb_state_issue_scalar); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_vmem_tex", arb_state_issue_vmem_tex); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_flat", arb_state_issue_flat); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_exp", arb_state_issue_exp); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_misc", arb_state_issue_misc); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_issue_brmsg", arb_state_issue_brmsg); + // Arb state (pipe stalled) + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_valu", arb_state_stall_valu); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_matrix", arb_state_stall_matrix); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_lds", arb_state_stall_lds); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_lds_direct", arb_state_stall_lds_direct); + 
ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_scalar", arb_state_stall_scalar); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_vmem_tex", arb_state_stall_vmem_tex); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_flat", arb_state_stall_flat); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_exp", arb_state_stall_exp); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_misc", arb_state_stall_misc); + ROCP_SDK_SAVE_DATA_BITFIELD("arb_state_stall_brmsg", arb_state_stall_brmsg); +} + +template +void +save(ArchiveT& ar, rocprofiler_pc_sampling_record_stochastic_v0_t data) +{ + // flags specific for stochastic sampling + ROCP_SDK_SAVE_DATA_FIELD(flags); + + // Common for host-trap and stochastic + ROCP_SDK_SAVE_DATA_FIELD(hw_id); + ROCP_SDK_SAVE_DATA_FIELD(pc); + ROCP_SDK_SAVE_DATA_FIELD(exec_mask); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(dispatch_id); + ROCP_SDK_SAVE_DATA_VALUE("corr_id", correlation_id); + ROCP_SDK_SAVE_DATA_VALUE("wrkgrp_id", workgroup_id); + ROCP_SDK_SAVE_DATA_BITFIELD("wave_in_grp", wave_in_group); + + // fields specific for stochastic + ROCP_SDK_SAVE_DATA_BITFIELD("wave_issued", wave_issued); + save_pc_sampling_inst_type( + ar, static_cast(data.inst_type)); + ROCP_SDK_SAVE_DATA_BITFIELD("wave_cnt", wave_count); + ROCP_SDK_SAVE_DATA_FIELD(snapshot); + + // TODO: add memory counters +} + template void save(ArchiveT& ar, rocprofiler_agent_io_link_t data) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index 71130ef52a..b5a634b7df 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -475,8 +475,9 @@ typedef enum typedef enum { ROCPROFILER_PC_SAMPLING_RECORD_NONE = 0, + ROCPROFILER_PC_SAMPLING_RECORD_INVALID_SAMPLE, ///< ::rocprofiler_pc_sampling_record_invalid_t ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE, ///< 
::rocprofiler_pc_sampling_record_host_trap_v0_t - ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE, ///< for the future use + ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE, ///< ::rocprofiler_pc_sampling_record_stochastic_v0_t ROCPROFILER_PC_SAMPLING_RECORD_LAST, } rocprofiler_pc_sampling_record_kind_t; diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h index b366d8b801..40d8293666 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/pc_sampling.h @@ -106,11 +106,13 @@ ROCPROFILER_EXTERN_C_INIT * 1. PC sampling is already configured with configuration different than requested, * 2. PC sampling is requested from a process that runs within the ROCgdb. * 3. HSA runtime does not support PC sampling. + * 4. GPU device does not support requested PC sampling method. * @retval ::ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL the amdgpu driver installed on the system * does not support the PC sampling feature * @retval ::ROCPROFILER_STATUS_ERROR a general error caused by the amdgpu driver * @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT counter collection service already * setup in the context + * @retval ::ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT function invoked with an invalid argument */ rocprofiler_status_t rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t context_id, @@ -121,6 +123,19 @@ rocprofiler_configure_pc_sampling_service(rocprofiler_context_id_t conte rocprofiler_buffer_id_t buffer_id, int flags) ROCPROFILER_API; +/** + * @brief Enumeration describing values of flags of ::rocprofiler_pc_sampling_configuration_t. 
+ */ +typedef enum rocprofiler_pc_sampling_configuration_flags_t +{ + ROCPROFILER_PC_SAMPLING_CONFIGURATION_FLAGS_NONE = 0, + ROCPROFILER_PC_SAMPLING_CONFIGURATION_FLAGS_INTERVAL_POW2, + ROCPROFILER_PC_SAMPLING_CONFIGURATION_FLAGS_LAST + + /// @var ROCPROFILER_PC_SAMPLING_CONFIGURATION_FLAGS_INTERVAL_POW2 + /// @brief The interval value must be a power of 2. +} rocprofiler_pc_sampling_configuration_flags_t; + /** * @brief PC sampling configuration supported by a GPU agent. */ @@ -131,7 +146,7 @@ typedef struct rocprofiler_pc_sampling_unit_t unit; size_t min_interval; size_t max_interval; - uint64_t flags; /// for future use + uint64_t flags; ///< take values from ::rocprofiler_pc_sampling_configuration_flags_t /// @var method /// @brief Sampling method supported by the GPU agent. @@ -202,12 +217,11 @@ rocprofiler_query_pc_sampling_agent_configurations( */ typedef struct rocprofiler_pc_sampling_hw_id_v0_t { - uint64_t chiplet : 6; ///< chiplet index (3 bits allocated by the ROCr runtime) - uint64_t wave_id : 7; ///< wave slot index - uint64_t simd_id : 2; ///< SIMD index - uint64_t pipe_id : 4; ///< pipe index - uint64_t cu_or_wgp_id : 4; ///< Index of compute unit on GFX9 or workgroup processer on other - ///< architectures + uint64_t chiplet : 6; ///< chiplet index (3 bits allocated by the ROCr runtime) + uint64_t wave_id : 7; ///< wave slot index + uint64_t simd_id : 2; ///< SIMD index + uint64_t pipe_id : 4; ///< pipe index + uint64_t cu_or_wgp_id : 4; uint64_t shader_array_id : 1; ///< Shared array index uint64_t shader_engine_id : 5; ///< shared engine index uint64_t workgroup_id : 7; ///< thread_group index on GFX9, and workgroup index on GFX10+ @@ -215,6 +229,9 @@ typedef struct rocprofiler_pc_sampling_hw_id_v0_t uint64_t queue_id : 4; ///< queue id uint64_t microengine_id : 2; ///< ACE (microengine) index uint64_t reserved0 : 16; ///< Reserved for the future use + + /// @var cu_or_wgp_id + /// @brief Compute unit index on GFX9 or workgroup processor index 
on GFX10+. } rocprofiler_pc_sampling_hw_id_v0_t; /** @@ -242,7 +259,6 @@ typedef struct /// @ref code_object_id is equal to the ::ROCPROFILER_CODE_OBJECT_ID_NONE. } rocprofiler_pc_t; -// TODO: The definition of this struct might change over time. /** * @brief ROCProfiler Host-Trap PC Sampling Record. */ @@ -263,6 +279,239 @@ typedef struct rocprofiler_pc_sampling_record_host_trap_v0_t /// @brief API launch call id that matches dispatch ID } rocprofiler_pc_sampling_record_host_trap_v0_t; +/** + * @brief The header of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t, indicating + * what fields of the @ref rocprofiler_pc_sampling_record_stochastic_v0_t instance are meaningful + * for the sample. + */ +typedef struct rocprofiler_pc_sampling_record_stochastic_header_t +{ + uint8_t has_memory_counter : 1; ///< pc sample provides memory counters information + ///< via ::rocprofiler_pc_sampling_memory_counters_t + uint8_t reserved_type : 7; +} rocprofiler_pc_sampling_record_stochastic_header_t; + +/** + * @brief Enumeration describing type of sampled issued instruction. 
+ */ +typedef enum rocprofiler_pc_sampling_instruction_type_t +{ + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NONE = 0, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_VALU, ///< vector ALU instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MATRIX, ///< matrix instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_SCALAR, ///< scalar (memory) instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_TEX, ///< texture memory instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS, ///< LDS memory instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS_DIRECT, ///< LDS direct memory instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_FLAT, ///< flat memory instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_EXPORT, ///< export instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MESSAGE, ///< message instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BARRIER, ///< barrier instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_JUMP, ///< jump instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_OTHER, ///< other types of instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NO_INST, ///< no instruction issued + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_DUAL_VALU, ///< dual VALU instruction + ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LAST + + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN + /// @brief Instruction representing a branch not being taken. + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN + /// @brief Instruction representing a taken branch. +} rocprofiler_pc_sampling_instruction_type_t; + +/** + * @brief Enumeration describing reason for not issuing an instruction. 
+ */ +typedef enum rocprofiler_pc_sampling_instruction_not_issued_reason_t +{ + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NONE = 0, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_WAITCNT, ///< waitcnt dependency + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL_INSTRUCTION, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_BARRIER_WAIT, ///< waiting on a barrier + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_OTHER_WAIT, + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_SLEEP_WAIT, ///< wave was sleeping + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_LAST + + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE + /// @brief No instruction available in the instruction cache. + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY + /// @brief ALU dependency not resolved. + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL_INSTRUCTION + /// @brief Wave executes an internal instruction. + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN + /// @brief The instruction did not win the arbiter. + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL + /// @brief Arbiter issued an instruction, but the execution pipe pushed it back from execution. + /// @var ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_OTHER_WAIT + /// @brief Other types of wait (e.g., wait for XNACK acknowledgment). + +} rocprofiler_pc_sampling_instruction_not_issued_reason_t; + +/** + * @brief Data provided by stochastic sampling hardware. 
+ * + */ +typedef struct rocprofiler_pc_sampling_snapshot_v0_t +{ + uint32_t reason_not_issued : 4; + uint32_t reserved0 : 1; ///< reserved for future use + uint32_t arb_state_issue_valu : 1; ///< arbiter issued a VALU instruction + uint32_t arb_state_issue_matrix : 1; ///< arbiter issued a matrix instruction + uint32_t arb_state_issue_lds : 1; ///< arbiter issued a LDS instruction + uint32_t arb_state_issue_lds_direct : 1; ///< arbiter issued a LDS direct instruction + uint32_t arb_state_issue_scalar : 1; ///< arbiter issued a scalar (SALU/SMEM) instruction + uint32_t arb_state_issue_vmem_tex : 1; ///< arbiter issued a texture instruction + uint32_t arb_state_issue_flat : 1; ///< arbiter issued a FLAT instruction + uint32_t arb_state_issue_exp : 1; ///< arbiter issued a export instruction + uint32_t arb_state_issue_misc : 1; ///< arbiter issued a miscellaneous instruction + uint32_t arb_state_issue_brmsg : 1; ///< arbiter issued a branch/message instruction + uint32_t arb_state_issue_reserved : 1; ///< reserved for the future use + uint32_t arb_state_stall_valu : 1; + uint32_t arb_state_stall_matrix : 1; ///< matrix instruction was stalled + uint32_t arb_state_stall_lds : 1; ///< LDS instruction was stalled + uint32_t arb_state_stall_lds_direct : 1; ///< LDS direct instruction was stalled + uint32_t arb_state_stall_scalar : 1; ///< Scalar (SALU/SMEM) instruction was stalled + uint32_t arb_state_stall_vmem_tex : 1; ///< texture instruction was stalled + uint32_t arb_state_stall_flat : 1; ///< flat instruction was stalled + uint32_t arb_state_stall_exp : 1; ///< export instruction was stalled + uint32_t arb_state_stall_misc : 1; ///< miscellaneous instruction was stalled + uint32_t arb_state_stall_brmsg : 1; ///< branch/message instruction was stalled + uint32_t arb_state_state_reserved : 1; ///< reserved for the future use + // We have two reserved bits + uint32_t dual_issue_valu : 1; + uint32_t reserved1 : 1; ///< reserved for the future use + uint32_t reserved2 : 
3; ///< reserved for the future use + + /// @var reason_not_issued + /// @brief The reason for not issuing an instruction. The field takes one of the values defined + /// in @ref ::rocprofiler_pc_sampling_instruction_not_issued_reason_t + /// @var arb_state_stall_valu + /// @brief VALU instruction was stalled when a sample was generated + /// @var dual_issue_valu + /// @brief Two VALU instructions were issued for coexecution (MI3xx specific) +} rocprofiler_pc_sampling_snapshot_v0_t; + +/** + * @brief Counters of issued but not yet completed instructions. + */ +typedef struct rocprofiler_pc_sampling_memory_counters_t +{ + uint32_t load_cnt : 6; + uint32_t store_cnt : 6; + uint32_t bvh_cnt : 3; + uint32_t sample_cnt : 6; + uint32_t ds_cnt : 6; + uint32_t km_cnt : 5; + + /// @var load_cnt + /// @brief Counts the number of VMEM load instructions issued but not yet completed. + /// @var store_cnt + /// @brief Counts the number of VMEM store instructions issued but not yet completed. + /// @var bvh_cnt + /// @brief Counts the number of VMEM BVH instructions issued but not yet completed. + /// @var sample_cnt + /// @brief Counts the number of VMEM sample instructions issued but not yet completed. + /// @var ds_cnt + /// @brief Counts the number of LDS instructions issued but not yet completed. + /// @var km_cnt + /// @brief Counts the number of scalar memory reads and memory instructions issued but not yet + /// completed. +} rocprofiler_pc_sampling_memory_counters_t; + +/** + * @brief ROCProfiler Stochastic PC Sampling Record. 
+ */ +typedef struct rocprofiler_pc_sampling_record_stochastic_v0_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_pc_sampling_record_stochastic_header_t flags; + uint8_t wave_in_group; + uint8_t wave_issued : 1; + uint8_t inst_type : 5; + uint8_t reserved : 2; + rocprofiler_pc_sampling_hw_id_v0_t hw_id; + rocprofiler_pc_t pc; + uint64_t exec_mask; + rocprofiler_dim3_t workgroup_id; + uint32_t wave_count; + uint64_t timestamp; + uint64_t dispatch_id; + rocprofiler_async_correlation_id_t correlation_id; + rocprofiler_pc_sampling_snapshot_v0_t snapshot; + rocprofiler_pc_sampling_memory_counters_t memory_counters; + + /// @var flags + /// @brief Defines what fields are meaningful for the sample. + /// @var wave_in_group + /// @brief wave position within the workgroup (0-15) + /// @var wave_issued + /// @brief wave issued the instruction represented with the PC + /// @var inst_type + /// @brief instruction type, takes a value defined in @ref + /// ::rocprofiler_pc_sampling_instruction_type_t + /// @var reserved + /// @brief reserved 2 bits must be zero + /// @var hw_id + /// @brief @see ::rocprofiler_pc_sampling_hw_id_v0_t + /// @var pc + /// @brief information about sampled program counter + /// @var exec_mask + /// @brief active SIMD lanes at the moment of sampling + /// @var workgroup_id + /// @brief wave coordinates within the workgroup + /// @var wave_count + /// @brief active waves on the CU at the moment of sampling + /// @var timestamp + /// @brief timestamp when sample is generated + /// @var dispatch_id + /// @brief originating kernel dispatch ID + /// @var correlation_id + /// @brief API launch call id that matches dispatch ID + /// @var snapshot + /// @brief Data provided by stochastic sampling hardware. @see + /// ::rocprofiler_pc_sampling_snapshot_v0_t + /// @var memory_counters + /// @brief Counters of issued but not yet completed instructions. 
@see + /// ::rocprofiler_pc_sampling_memory_counters_t +} rocprofiler_pc_sampling_record_stochastic_v0_t; + +/** + * @brief Record representing an invalid PC Sampling Record. + */ +typedef struct rocprofiler_pc_sampling_record_invalid_t +{ + uint64_t size; ///< Size of the struct +} rocprofiler_pc_sampling_record_invalid_t; + +/** + * @fn C compatible string representation of the PC sampling instruction type + * @brief Return the string encoding of @ref rocprofiler_pc_sampling_instruction_type_t value + * @param [in] instruction_type instruction type enum value + * @return Will return a nullptr if invalid/unsupported @ref + * rocprofiler_pc_sampling_instruction_type_t value is provided. + */ +const char* +rocprofiler_get_pc_sampling_instruction_type_name( + rocprofiler_pc_sampling_instruction_type_t instruction_type) ROCPROFILER_API; + +/** + * @fn C compatible string representation of reason for not issuing an instruction + * @brief Return the string encoding of @ref rocprofiler_pc_sampling_instruction_not_issued_reason_t + * value + * @param [in] not_issued_reason no issue reason enum value + * @return Will return a nullptr if invalid/unsupported @ref + * rocprofiler_pc_sampling_instruction_not_issued_reason_t value is provided. 
+ */ +const char* +rocprofiler_get_pc_sampling_instruction_not_issued_reason_name( + rocprofiler_pc_sampling_instruction_not_issued_reason_t not_issued_reason) ROCPROFILER_API; + /** @} */ ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp index 22bb21da02..0cb4ecfa58 100644 --- a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp @@ -168,5 +168,8 @@ using kernel_dispatch_buffered_output_with_stream_t = domain_type::KERNEL_DISPATCH>; using memory_copy_buffered_output_with_stream_t = buffered_output; +using pc_sampling_stochastic_buffered_output_t = + buffered_output; } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/csv.hpp b/projects/rocprofiler-sdk/source/lib/output/csv.hpp index 8caee07eea..2ea467b5d6 100644 --- a/projects/rocprofiler-sdk/source/lib/output/csv.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/csv.hpp @@ -111,6 +111,7 @@ using stats_csv_encoder = csv_encoder<8>; using pc_sampling_host_trap_csv_encoder = csv_encoder<6>; using kernel_trace_with_stream_csv_encoder = csv_encoder<19>; using memory_copy_with_stream_csv_encoder = csv_encoder<8>; +using pc_sampling_stochastic_csv_encoder = csv_encoder<10>; } // namespace csv } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp index 3cabffc150..5b4f295d90 100644 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp @@ -63,6 +63,10 @@ DEFINE_BUFFER_TYPE_NAME(PC_SAMPLING_HOST_TRAP, "pc_sampling_host_trap_stats") DEFINE_BUFFER_TYPE_NAME(ROCDECODE, "ROCDECODE_API", "rocdecode_api_trace", "rocdecode_api_stats") DEFINE_BUFFER_TYPE_NAME(ROCJPEG, "ROCJPEG_API", 
"rocjpeg_api_trace", "rocjpeg_api_stats") +DEFINE_BUFFER_TYPE_NAME(PC_SAMPLING_STOCHASTIC, + "PC_SAMPLING_STOCHASTIC", + "pc_sampling_stochastic", + "pc_sampling_stochastic_stats") #undef DEFINE_BUFFER_TYPE_NAME diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp index 5351857a8e..016ea67461 100644 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp @@ -39,6 +39,7 @@ enum class domain_type PC_SAMPLING_HOST_TRAP, ROCDECODE, ROCJPEG, + PC_SAMPLING_STOCHASTIC, LAST, }; diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp index 99a0c18fbf..cb75b9b550 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp @@ -877,6 +877,79 @@ generate_csv(const output_config& c } } +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats) +{ + if(data.empty()) return; + + if(cfg.stats && stats) + write_stats(get_stats_output_file(cfg, domain_type::PC_SAMPLING_STOCHASTIC), stats.entries); + + auto ofs = tool::csv_output_file{cfg, + domain_type::PC_SAMPLING_STOCHASTIC, + tool::csv::pc_sampling_stochastic_csv_encoder{}, + { + "Sample_Timestamp", + "Exec_Mask", + "Dispatch_Id", + "Instruction", + "Instruction_Comment", + "Correlation_Id", + "Wave_Issued_Instruction", + "Instruction_Type", + "Stall_Reason", + "Wave_Count", + }}; + for(auto ditr : data) + { + for(const auto& record : data.get(ditr)) + { + std::string inst; + std::string inst_comment; + if(record.inst_index == -1) + { + // A sample originates from a blit kernel or self-modifying code, + // so instruction cannot be decoded + inst_comment = "Unrecognized code object id, physical virtual address of PC:" + + 
std::to_string(record.pc_sample_record.pc.code_object_offset); + } + else + { + // Provide decoded instruction and comment + inst = tool_metadata.get_instruction(record.inst_index); + inst_comment = tool_metadata.get_comment(record.inst_index); + } + + auto row_ss = std::stringstream{}; + rocprofiler::tool::csv::pc_sampling_stochastic_csv_encoder::write_row( + row_ss, + record.pc_sample_record.timestamp, + record.pc_sample_record.exec_mask, + record.pc_sample_record.dispatch_id, + inst, + inst_comment, + record.pc_sample_record.correlation_id.internal, + // As wave_issued is uint8_t of size 1, it can be dumped as char. + // To prevent that, explicitly cast it to integer, so that CSV output + // shows human-readable 0/1 values. + static_cast(record.pc_sample_record.wave_issued), + std::string(rocprofiler_get_pc_sampling_instruction_type_name( + static_cast( + record.pc_sample_record.inst_type))), + std::string(rocprofiler_get_pc_sampling_instruction_not_issued_reason_name( + static_cast( + record.pc_sample_record.snapshot.reason_not_issued))), + // Similar reasoning as for wave_issued. 
+ static_cast(record.pc_sample_record.wave_count)); + + ofs << row_ss.str(); + } + } +} + void generate_csv(const output_config& cfg, const metadata& /*tool_metadata*/, diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp index ec6c8d4a5b..b1c6d6e3e8 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp @@ -110,6 +110,12 @@ generate_csv(const output_config& c const generator& data, const stats_entry_t& stats); +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats); + void generate_csv(const output_config& cfg, const metadata& tool_metadata, diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp index f88ddfbbab..f87870ea43 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp @@ -196,10 +196,10 @@ write_json(json_output& json_ar, generator scratch_memory_gen, generator rccl_api_gen, generator memory_allocation_gen, - generator pc_sampling_gen, generator rocdecode_api_gen, - generator rocjpeg_api_gen) - + generator rocjpeg_api_gen, + generator pc_sampling_host_trap_gen, + generator pc_sampling_stochastic_gen) { // summary { @@ -239,9 +239,10 @@ write_json(json_output& json_ar, json_ar(cereal::make_nvp("memory_copy", memory_copy_gen)); json_ar(cereal::make_nvp("memory_allocation", memory_allocation_gen)); json_ar(cereal::make_nvp("scratch_memory", scratch_memory_gen)); - json_ar(cereal::make_nvp("pc_sample_host_trap", pc_sampling_gen)); json_ar(cereal::make_nvp("rocdecode_api", rocdecode_api_gen)); json_ar(cereal::make_nvp("rocjpeg_api", rocjpeg_api_gen)); + json_ar(cereal::make_nvp("pc_sample_host_trap", pc_sampling_host_trap_gen)); + 
json_ar(cereal::make_nvp("pc_sample_stochastic", pc_sampling_stochastic_gen)); json_ar.finishNode(); } } diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp index 3c3dd32ad6..1b9343e9c4 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp @@ -94,9 +94,9 @@ write_json(json_output& j generator scratch_memory_gen, generator rccl_api_gen, generator memory_allocation_gen, - generator pc_sampling_gen, generator rocdecode_api_gen, - generator rocjpeg_api_gen); - + generator rocjpeg_api_gen, + generator pc_sampling_host_trap_gen, + generator pc_sampling_stochastic_gen); } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp index 39ef5a3c3e..541409fd56 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp @@ -453,7 +453,23 @@ generate_stats(const output_config& /* cfg*/, const metadata& /*tool_metadata*/, const generator& /*data*/) { + // TODO: + // 1. Implement serialization for PC sampling stats. + // The format differs significantly from tracing stats. + // 2. Decide what is going to be part of the stats. + // Some basic information is already available in the tool_metadata.pc_sampling_stats. + // This contains the total number of valid VS invalid samples. return stats_entry_t{}; } + +stats_entry_t +generate_stats(const output_config& /* cfg*/, + const metadata& /*tool_metadata*/, + const generator& /*data*/) +{ + // TODO: the same TODOs from the function above apply here. 
+ return stats_entry_t{}; +} + } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp index a38271a386..386f94fe67 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp @@ -90,6 +90,12 @@ stats_entry_t generate_stats(const output_config& cfg, const metadata& tool_metadata, const generator& data); + +stats_entry_t +generate_stats(const output_config& cfg, + const metadata& tool_metadata, + const generator& data); + void generate_stats(const output_config& cfg, const metadata& tool_metadata, diff --git a/projects/rocprofiler-sdk/source/lib/output/metadata.hpp b/projects/rocprofiler-sdk/source/lib/output/metadata.hpp index 86ff372011..43f5dbf27c 100644 --- a/projects/rocprofiler-sdk/source/lib/output/metadata.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/metadata.hpp @@ -87,6 +87,9 @@ using att_filenames_map_t = std::unordered_map; template using synced_map = common::Synchronized; +template +using synced_obj = common::Synchronized; +using pc_sampling_stats_t = rocprofiler_tool_pc_sampling_stats; enum class agent_indexing { @@ -133,6 +136,7 @@ struct metadata synced_map host_functions = {}; synced_map code_object_load = {}; att_filenames_map_t att_filenames = {}; + synced_obj pc_sampling_stats = {}; metadata() = default; metadata(inprocess); diff --git a/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp b/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp index 6b77f1ed89..b7e092962c 100644 --- a/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp @@ -23,6 +23,7 @@ #pragma once #include +#include #include #include @@ -79,5 +80,32 @@ struct rocprofiler_tool_pc_sampling_host_trap_record_t } }; +// TODO:: Check if we can template this 
structure +struct rocprofiler_tool_pc_sampling_stochastic_record_t +{ + rocprofiler_pc_sampling_record_stochastic_v0_t pc_sample_record; + int64_t inst_index; + + rocprofiler_tool_pc_sampling_stochastic_record_t( + rocprofiler_pc_sampling_record_stochastic_v0_t record, + int64_t index) + : pc_sample_record(record) + , inst_index(index) + {} + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("record", pc_sample_record)); + ar(cereal::make_nvp("inst_index", inst_index)); + } +}; + +struct rocprofiler_tool_pc_sampling_stats +{ + uint64_t valid_samples = 0; + uint64_t invalid_samples = 0; +}; + } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index 15e839d15d..fdebf3d8bf 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -285,10 +285,28 @@ config::config() {"stochastic", ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC}, {"host_trap", ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP}}; - pc_sampling_method_value = pc_sampling_method_map.at(pc_sampling_method); + try + { + pc_sampling_method_value = pc_sampling_method_map.at(pc_sampling_method); + } catch(...) + { + ROCP_FATAL << "Invalid value for ROCPROF_PC_SAMPLING_METHOD: " << pc_sampling_method << "." + << "Valid choices are stochastic and host_trap\n"; + } + if(pc_sampling_method_value == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) pc_sampling_host_trap = true; - pc_sampling_unit_value = pc_sampling_unit_map.at(pc_sampling_unit); + else if(pc_sampling_method_value == ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC) + pc_sampling_stochastic = true; + + try + { + pc_sampling_unit_value = pc_sampling_unit_map.at(pc_sampling_unit); + } catch(...) + { + ROCP_FATAL << "Invalid value for ROCPROF_PC_SAMPLING_UNIT: " << pc_sampling_unit << "." 
+ << "Valid choices are instructions, cycles and time\n"; + } if(auto _collection_period = get_env("ROCPROF_COLLECTION_PERIOD", ""); !_collection_period.empty()) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp index 3fe697cd18..3f40d0d15d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp @@ -116,6 +116,7 @@ struct config : output_config bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); bool pc_sampling_host_trap = false; bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false); + bool pc_sampling_stochastic = false; size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1); bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false); rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index b97560e53d..87352c623a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -147,10 +147,11 @@ struct buffer_ids rocprofiler_buffer_id_t pc_sampling_host_trap = {}; rocprofiler_buffer_id_t rocdecode_api_trace = {}; rocprofiler_buffer_id_t rocjpeg_api_trace = {}; + rocprofiler_buffer_id_t pc_sampling_stochastic = {}; auto as_array() const { - return std::array{hsa_api_trace, + return std::array{hsa_api_trace, hip_api_trace, kernel_trace, memory_copy_trace, @@ -160,7 +161,8 @@ struct buffer_ids rccl_api_trace, pc_sampling_host_trap, rocdecode_api_trace, - rocjpeg_api_trace}; + rocjpeg_api_trace, + pc_sampling_stochastic}; } }; @@ -726,7 +728,8 @@ 
code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, auto* obj_data = static_cast(record.payload); CHECK_NOTNULL(tool_metadata)->add_code_object(*obj_data); - if(tool::get_config().pc_sampling_host_trap) + if(tool::get_config().pc_sampling_host_trap || + tool::get_config().pc_sampling_stochastic) { CHECK_NOTNULL(tool_metadata)->add_decoder(obj_data); } @@ -1178,6 +1181,10 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, { if(!headers) return; + // count number of valid VS invalid samples delivered by this callback + uint64_t valid_samples_cnt = 0; + uint64_t invalid_samples_cnt = 0; + for(size_t i = 0; i < num_headers; i++) { auto* cur_header = headers[i]; @@ -1202,6 +1209,25 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, rocprofiler::tool::write_ring_buffer(pc_sample_tool_record, domain_type::PC_SAMPLING_HOST_TRAP); + + valid_samples_cnt++; + } + else if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE) + { + auto* pc_sample = static_cast( + cur_header->payload); + + auto pc_sample_tool_record = + rocprofiler::tool::rocprofiler_tool_pc_sampling_stochastic_record_t( + *pc_sample, get_instruction_index(pc_sample->pc)); + + rocprofiler::tool::write_ring_buffer(pc_sample_tool_record, + domain_type::PC_SAMPLING_STOCHASTIC); + valid_samples_cnt++; + } + else if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_INVALID_SAMPLE) + { + invalid_samples_cnt++; } } else @@ -1209,6 +1235,13 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, ROCP_FATAL << "unexpected rocprofiler_record_header_t category + kind"; } } + + // sum up number of valid/invalid samples for pc sampling stats + tool_metadata->pc_sampling_stats.wlock( + [valid_samples_cnt, invalid_samples_cnt](auto& pc_sampling_stats) { + pc_sampling_stats.valid_samples += valid_samples_cnt; + pc_sampling_stats.invalid_samples += invalid_samples_cnt; + }); } void @@ -1377,6 +1410,52 @@ 
if_pc_sample_config_match(rocprofiler_agent_id_t agent_id, return false; } +void +configure_pc_sampling_on_all_agents(uint64_t buffer_size, + uint64_t buffer_watermark, + void* tool_data) +{ + auto method = tool::get_config().pc_sampling_method_value; + auto unit = tool::get_config().pc_sampling_unit_value; + + // Find the proper buffer_id based on the method + auto* buffer_id = (method == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) + ? &get_buffers().pc_sampling_host_trap + : &get_buffers().pc_sampling_stochastic; + + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + buffer_size, + buffer_watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + rocprofiler_pc_sampling_callback, + tool_data, + buffer_id), + "buffer creation"); + + bool config_match_found = false; + auto agent_ptr_vec = get_gpu_agents(); + for(auto& itr : agent_ptr_vec) + { + if(if_pc_sample_config_match( + itr->id, method, unit, tool::get_config().pc_sampling_interval)) + { + config_match_found = true; + int flags = 0; + ROCPROFILER_CALL( + rocprofiler_configure_pc_sampling_service(get_client_ctx(), + itr->id, + method, + unit, + tool::get_config().pc_sampling_interval, + *buffer_id, + flags), + "configure PC sampling"); + } + } + if(!config_match_found) + ROCP_FATAL << "Given PC sampling configuration is not supported on any of the agents"; +} + int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { @@ -1745,38 +1824,11 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) if(tool::get_config().pc_sampling_host_trap) { - ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), - buffer_size, - buffer_watermark, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - rocprofiler_pc_sampling_callback, - tool_data, - &get_buffers().pc_sampling_host_trap), - "buffer creation"); - bool config_match_found = false; - auto agent_ptr_vec = get_gpu_agents(); - for(auto& itr : agent_ptr_vec) - { - auto method = tool::get_config().pc_sampling_method_value; - auto unit = 
tool::get_config().pc_sampling_unit_value; - if(if_pc_sample_config_match( - itr->id, method, unit, tool::get_config().pc_sampling_interval)) - { - config_match_found = true; - int flags = 0; - ROCPROFILER_CALL(rocprofiler_configure_pc_sampling_service( - get_client_ctx(), - itr->id, - method, - unit, - tool::get_config().pc_sampling_interval, - get_buffers().pc_sampling_host_trap, - flags), - "configure PC sampling"); - } - } - if(!config_match_found) - ROCP_FATAL << "Given PC sampling configuration is not supported on any of the agents"; + configure_pc_sampling_on_all_agents(buffer_size, buffer_watermark, tool_data); + } + else if(tool::get_config().pc_sampling_stochastic) + { + configure_pc_sampling_on_all_agents(buffer_size, buffer_watermark, tool_data); } for(auto itr : get_buffers().as_array()) @@ -1897,6 +1949,8 @@ tool_fini(void* /*tool_data*/) auto rocdecode_output = tool::rocdecode_buffered_output_t{tool::get_config().rocdecode_api_trace}; auto rocjpeg_output = tool::rocjpeg_buffered_output_t{tool::get_config().rocjpeg_api_trace}; + auto pc_sampling_stochastic_output = + tool::pc_sampling_stochastic_buffered_output_t{tool::get_config().pc_sampling_stochastic}; auto node_id_sort = [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; }; auto agents_output = CHECK_NOTNULL(tool_metadata)->agents; @@ -1917,6 +1971,7 @@ tool_fini(void* /*tool_data*/) generate_output(rocdecode_output, num_output, contributions); generate_output(pc_sampling_host_trap_output, num_output, contributions); generate_output(rocjpeg_output, num_output, contributions); + generate_output(pc_sampling_stochastic_output, num_output, contributions); if(tool::get_config().advanced_thread_trace && !tool::get_config().att_capability.empty() && !tool_metadata->att_filenames.empty()) @@ -1955,9 +2010,10 @@ tool_fini(void* /*tool_data*/) scratch_memory_output.get_generator(), rccl_output.get_generator(), memory_allocation_output.get_generator(), - 
pc_sampling_host_trap_output.get_generator(), rocdecode_output.get_generator(), - rocjpeg_output.get_generator()); + rocjpeg_output.get_generator(), + pc_sampling_host_trap_output.get_generator(), + pc_sampling_stochastic_output.get_generator()); json_ar.finish_process(); tool::close_json(json_ar); @@ -2074,6 +2130,7 @@ tool_fini(void* /*tool_data*/) destroy_output(pc_sampling_host_trap_output); destroy_output(rocdecode_output); destroy_output(rocjpeg_output); + destroy_output(pc_sampling_stochastic_output); if(kernel_rename_and_stream_display_pair_dtors != nullptr) { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp index 3432df66e9..e20bf4842d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling.cpp @@ -35,6 +35,87 @@ namespace { +#define ROCPROFILER_INSTRUCTION_TYPE_STRING(CODE) \ + template <> \ + struct instruction_type_string \ + { \ + static constexpr auto name = #CODE; \ + }; + +#define ROCPROFILER_NO_ISSUE_REASON_STRING(CODE) \ + template <> \ + struct no_issue_reason_string \ + { \ + static constexpr auto name = #CODE; \ + }; + +template +struct instruction_type_string; + +template +struct no_issue_reason_string; + +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NONE); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_VALU); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MATRIX); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_SCALAR); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_TEX); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS_DIRECT); 
+ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_FLAT); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_EXPORT); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MESSAGE); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BARRIER); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_JUMP); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_OTHER); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_NO_INST); +ROCPROFILER_INSTRUCTION_TYPE_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_DUAL_VALU); + +ROCPROFILER_NO_ISSUE_REASON_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NONE); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY); +ROCPROFILER_NO_ISSUE_REASON_STRING(ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_WAITCNT); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL_INSTRUCTION); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_BARRIER_WAIT); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_OTHER_WAIT); +ROCPROFILER_NO_ISSUE_REASON_STRING( + ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_SLEEP_WAIT); + +template +const 
char* +get_instruction_type_name(rocprofiler_pc_sampling_instruction_type_t instruction_type, + std::index_sequence) +{ + if(instruction_type == Idx) return instruction_type_string::name; + // recursion until tail empty + if constexpr(sizeof...(Tail) > 0) + return get_instruction_type_name(instruction_type, std::index_sequence{}); + return nullptr; +} + +template +const char* +get_no_issue_reason_name(rocprofiler_pc_sampling_instruction_not_issued_reason_t no_issue_reason, + std::index_sequence) +{ + if(no_issue_reason == Idx) return no_issue_reason_string::name; + // recursion until tail empty + if constexpr(sizeof...(Tail) > 0) + return get_no_issue_reason_name(no_issue_reason, std::index_sequence{}); + return nullptr; +} + /** * @brief The functions checks if the `ROCPROFILER_PC_SAMPLING_BETA_ENABLED` is set. * If so, it will enable PC sampling API. Otherwise, the API is reported @@ -130,4 +211,22 @@ rocprofiler_query_pc_sampling_agent_configurations( return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; #endif } + +const char* +rocprofiler_get_pc_sampling_instruction_type_name( + rocprofiler_pc_sampling_instruction_type_t instruction_type) +{ + return get_instruction_type_name( + instruction_type, + std::make_index_sequence{}); +} + +const char* +rocprofiler_get_pc_sampling_instruction_not_issued_reason_name( + rocprofiler_pc_sampling_instruction_not_issued_reason_t not_issued_reason) +{ + return get_no_issue_reason_name( + not_issued_reason, + std::make_index_sequence{}); +} } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp index 48db72c893..accfd4d627 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.cpp @@ -46,17 +46,46 @@ namespace { #define PC_SAMPLING_IOCTL_BITMASK 0xFFFF -/** - * 
@brief Used to determine the version of PC sampling - * IOCTL implementation in the driver. - * - * @todo Remove this once the KFD IOCTL is upstreamed - */ -struct pc_sampling_ioctl_version_t +#define PC_SAMPLING_IOCTL_COMPUTE_VERSION(major, minor) ROCPROFILER_COMPUTE_VERSION(major, minor, 0) + +using pcs_ioctl_version_t = uint32_t; + +#define KFD_ROCP_PCS_METHOD_PAIR(KFD_ENUM_VAL, ROCP_ENUM_VAL) \ + template <> \ + struct pcs_method_pair \ + { \ + static constexpr auto rocp_enum_val = ROCP_ENUM_VAL; \ + }; + +template +struct pcs_method_pair; + +KFD_ROCP_PCS_METHOD_PAIR(ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_NONE, + ROCPROFILER_PC_SAMPLING_METHOD_NONE); +KFD_ROCP_PCS_METHOD_PAIR(ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1, + ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP); +KFD_ROCP_PCS_METHOD_PAIR(ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1, + ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC); + +template +rocprofiler_pc_sampling_method_t +get_rocp_pcs_method(rocprofiler_ioctl_pc_sampling_method_kind_t kfd_method, + std::index_sequence) { - uint32_t major_version; /// PC sampling IOCTL major version - uint32_t minor_version; /// PC sampling IOCTL minor version -}; + if(kfd_method == Idx) return pcs_method_pair::rocp_enum_val; + // recursion until tail empty + if constexpr(sizeof...(Tail) > 0) + return get_rocp_pcs_method(kfd_method, std::index_sequence{}); + // Return none value if matching fails + return ROCPROFILER_PC_SAMPLING_METHOD_NONE; +} + +rocprofiler_pc_sampling_method_t +get_rocp_pcs_method_from_kfd(rocprofiler_ioctl_pc_sampling_method_kind_t kfd_method) +{ + return get_rocp_pcs_method( + kfd_method, std::make_index_sequence{}); +} int kfd_open() @@ -137,7 +166,7 @@ get_ioctl_version(rocprofiler_ioctl_version_info_t& ioctl_version) * @return ::rocprofiler_status_t */ rocprofiler_status_t -get_pc_sampling_ioctl_version(uint32_t kfd_gpu_id, pc_sampling_ioctl_version_t& pcs_ioctl_version) +get_pc_sampling_ioctl_version(uint32_t kfd_gpu_id, 
pcs_ioctl_version_t* pcs_ioctl_version) { struct kfd_ioctl_pc_sample_args args; args.op = KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES; @@ -172,29 +201,27 @@ get_pc_sampling_ioctl_version(uint32_t kfd_gpu_id, pc_sampling_ioctl_version_t& // `version` field contains PC Sampling IOCTL version auto version = args.version; // Lower 16 bits represent minor version - pcs_ioctl_version.minor_version = version & PC_SAMPLING_IOCTL_BITMASK; + auto minor_version = version & PC_SAMPLING_IOCTL_BITMASK; // Upper 16 bits represent major version - pcs_ioctl_version.major_version = (version >> 16) & PC_SAMPLING_IOCTL_BITMASK; + auto major_version = (version >> 16) & PC_SAMPLING_IOCTL_BITMASK; + // finally, compute the version + *pcs_ioctl_version = PC_SAMPLING_IOCTL_COMPUTE_VERSION(major_version, minor_version); return ROCPROFILER_STATUS_SUCCESS; } /** - * @brief Check if PC sampling is supported on the device with @p kfd_gpu_id. + * @brief Check if PC sampling feature is supported in KFD. * * Starting from KFD IOCTL 1.16, KFD delivers beta implementation of the PC sampling. - * Furthermore, ROCProfiler-SDK expects PC sampling IOCTL 0.1 version. - * @todo: Once KFD is upstreamed, ROCProfiler-SDK will rely only on KFD IOCTL version. * * @return ::rocprofiler_status_t * @retval ::ROCPROFILER_STATUS_SUCCESS PC sampling is supported in the driver. * Other values informs users about the reason why PC sampling is not supported. 
*/ rocprofiler_status_t -is_pc_sampling_supported(const rocprofiler_agent_t* agent) +is_pc_sampling_supported() { - auto kfd_gpu_id = agent->gpu_id; - std::string_view agent_name = agent->name; // Verify KFD 1.16 version rocprofiler_ioctl_version_info_t ioctl_version = {.major_version = 0, .minor_version = 0}; auto status = get_ioctl_version(ioctl_version); @@ -208,58 +235,131 @@ is_pc_sampling_supported(const rocprofiler_agent_t* agent) return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; } - // TODO: remove once KFD is upstreamed - // Verify PC sampling IOCTL version - pc_sampling_ioctl_version_t pcs_ioctl_version = {.major_version = 0, .minor_version = 0}; - status = get_pc_sampling_ioctl_version(kfd_gpu_id, pcs_ioctl_version); - if(status != ROCPROFILER_STATUS_SUCCESS) + // PC Sampling feature is supported in the driver. + return ROCPROFILER_STATUS_SUCCESS; +} + +/** + * @brief Check if PC sampling method is supported on the agent. + * + * The function complements the @ref is_pc_sampling_supported function. + * It introduces a strict check against the PC sampling IOCTL version + * that tells us whether a certain PC sampling method is safe to be used + * on the specific device architecture. + * + * @param method - PC sampling method to be checked + * @param agent - The agent to be checked + * @param pcs_ioctl_version - The PC sampling IOCTL version + * @return ::rocprofiler_status_t + * @retval ::ROCPROFILER_STATUS_SUCCESS - The method is supported + * Other values informs users about the reason why the method is not supported. + */ +rocprofiler_status_t +is_pc_sampling_method_supported(rocprofiler_pc_sampling_method_t method, + const rocprofiler_agent_t* agent, + pcs_ioctl_version_t pcs_ioctl_version) +{ + std::string_view agent_name = agent->name; + if(method == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) { - // The reason for not emitting the "PC sampling unavailable" message is the following. 
- // Assume that all devices except one support PC sampling on the system. - // By emitting the message for that one device that doesn't support PC sampling, - // all tests and samples are skipped. Instead, tests and samples will ignore - // that one problematic device and continue using PC sampling on other devices - // that support this feature. - return status; + if(agent_name == "gfx90a") + { + // 0.1 version enables host-trap PC sampling on gfx90a + if(pcs_ioctl_version >= PC_SAMPLING_IOCTL_COMPUTE_VERSION(0, 1)) + return ROCPROFILER_STATUS_SUCCESS; + else + return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + } + else if(agent_name.find("gfx94") == 0) + { + // 0.3 version enables host-trap PC sampling on gfx940, gfx941, gfx942, etc. + if(pcs_ioctl_version >= PC_SAMPLING_IOCTL_COMPUTE_VERSION(0, 3)) + return ROCPROFILER_STATUS_SUCCESS; + else + return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + } + else if(agent_name.find("gfx95") == 0) + { + // 1.2 version enables host-trap PC sampling on gfx950 + if(pcs_ioctl_version >= PC_SAMPLING_IOCTL_COMPUTE_VERSION(1, 2)) + return ROCPROFILER_STATUS_SUCCESS; + else + return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + } } - else if(agent_name == "gfx90a") + else if(method == ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC) { - // For gfx90a, we expect PC sampling IOCTL to be at least 0.1. - if(pcs_ioctl_version.major_version > 0 || pcs_ioctl_version.minor_version >= 1) - return ROCPROFILER_STATUS_SUCCESS; - else - return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; - } - else if(agent_name.find("gfx94") == 0) - { - // We expect PC sampling IOCTL to be at least 0.3 for gfx940, gfx941, gfx942, etc. 
- if(pcs_ioctl_version.major_version > 0 || pcs_ioctl_version.minor_version >= 3) - return ROCPROFILER_STATUS_SUCCESS; - else - return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; - } - else if(agent_name.find("gfx95") == 0) - { - // As I am not sure if the PCS IOCTL is going to be bumped for gfx950, - // I introduced a separate branch for it. - // We expect PC sampling IOCTL to be at least 0.3 for gfx950. - if(pcs_ioctl_version.major_version > 0 || pcs_ioctl_version.minor_version >= 3) - return ROCPROFILER_STATUS_SUCCESS; - else - return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + if(agent_name == "gfx90a") + { + // gfx90a doesn't support stochastic PC sampling + return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; + } + else if(agent_name.find("gfx94") == 0) + { + // 1.3 version enables stochastic PC sampling on gfx940, gfx941, gfx942, etc. + if(pcs_ioctl_version >= PC_SAMPLING_IOCTL_COMPUTE_VERSION(1, 3)) + return ROCPROFILER_STATUS_SUCCESS; + else + return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + } + else if(agent_name.find("gfx95") == 0) + { + // 1.4 version enables stochastic PC sampling on gfx950 + if(pcs_ioctl_version >= PC_SAMPLING_IOCTL_COMPUTE_VERSION(1, 4)) + return ROCPROFILER_STATUS_SUCCESS; + else + return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL; + } } else { - // The agent does not support PC sampling. - return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; + return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; } + + // Other architecture do not support the PC sampling method. + return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; +} + +/** + * @brief Returns the PC sampling IOCTL version if the PC sampling feature is supported in the + * driver. + * + * First, check the minimal driver version via @ref is_pc_sampling_supported. + * Then, determines the PC sampling IOCTL version via @ref get_pc_sampling_ioctl_version. 
+ * + * @param [in] kfd_gpu_id - The KFD GPU identifier + * @param [out] pcs_ioctl_version - The PC sampling IOCTL version + * @return ::rocprofiler_status_t + */ +rocprofiler_status_t +get_pcs_ioctl_version_if_kfd_supports(uint32_t kfd_gpu_id, pcs_ioctl_version_t* pcs_ioctl_version) +{ + // Check if the PC sampling feature is supported in the driver + auto status = is_pc_sampling_supported(); + if(status != ROCPROFILER_STATUS_SUCCESS) return status; + + // Get the PC sampling IOCTL version + status = get_pc_sampling_ioctl_version(kfd_gpu_id, pcs_ioctl_version); + return status; +} + +/** + * @brief Same as @ref is_pc_sampling_method_supported. + */ +rocprofiler_status_t +is_pc_sampling_method_supported(rocprofiler_ioctl_pc_sampling_method_kind_t ioctl_method, + const rocprofiler_agent_t* agent, + pcs_ioctl_version_t pcs_ioctl_version) +{ + auto rocp_method = get_rocp_pcs_method_from_kfd(ioctl_method); + return is_pc_sampling_method_supported(rocp_method, agent, pcs_ioctl_version); } /** * @kfd_gpu_id represents the gpu identifier read from the content of the * /sys/class/kfd/kfd/topology/nodes//gpu_id. */ -ROCPROFILER_IOCTL_STATUS +rocprofiler_ioctl_status_t ioctl_query_pc_sampling_capabilities(uint32_t kfd_gpu_id, void* sample_info, uint32_t sample_info_sz, @@ -366,8 +466,9 @@ get_kfd_fd() rocprofiler_status_t ioctl_query_pcs_configs(const rocprofiler_agent_t* agent, rocp_pcs_cfgs_vec_t& rocp_configs) { - if(auto status = is_pc_sampling_supported(agent); status != ROCPROFILER_STATUS_SUCCESS) - return status; + pcs_ioctl_version_t pcs_ioctl_version = 0; + auto status = get_pcs_ioctl_version_if_kfd_supports(agent->gpu_id, &pcs_ioctl_version); + if(status != ROCPROFILER_STATUS_SUCCESS) return status; uint32_t kfd_gpu_id = agent->gpu_id; @@ -400,8 +501,15 @@ ioctl_query_pcs_configs(const rocprofiler_agent_t* agent, rocp_pcs_cfgs_vec_t& r { // FIXME: Why this happens?
if(ioctl_cfg.method == 0) continue; - // Skip showing stochastic sampling until it's fully supported. - if(ioctl_cfg.method == ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1) continue; + + // Strict check whether the driver version (safely) supports the sampling method for + // this specific device architecture. + // If not, skip showing this configuration to the user, as it's not safe to use this + // sampling method on this device. + if(is_pc_sampling_method_supported(ioctl_cfg.method, agent, pcs_ioctl_version) != + ROCPROFILER_STATUS_SUCCESS) + continue; + auto rocp_cfg = common::init_public_api_struct(rocprofiler_pc_sampling_configuration_t{}); auto rocp_ret = convert_ioctl_pcs_config_to_rocp(ioctl_cfg, rocp_cfg); if(rocp_ret != ROCPROFILER_STATUS_SUCCESS) @@ -470,12 +578,15 @@ ioctl_pcs_create(const rocprofiler_agent_t* agent, uint64_t interval, uint32_t* ioctl_pcs_id) { - if(auto status = is_pc_sampling_supported(agent); status != ROCPROFILER_STATUS_SUCCESS) - return status; + pcs_ioctl_version_t pcs_ioctl_version = 0; + auto status = get_pcs_ioctl_version_if_kfd_supports(agent->gpu_id, &pcs_ioctl_version); + if(status != ROCPROFILER_STATUS_SUCCESS) return status; - // Block configuring stochastic sampling until it's fully supported. - if(method == ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC) - return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; + // Strict check: whether the driver version (safely) supports the sampling method for + // this specific device architecture. If not, return an error and prevent the user from + // using this sampling method on this device. 
+ status = is_pc_sampling_method_supported(method, agent, pcs_ioctl_version); + if(status != ROCPROFILER_STATUS_SUCCESS) return status; rocprofiler_ioctl_pc_sampling_info_t ioctl_cfg; auto ret = create_ioctl_pcs_config_from_rocp(ioctl_cfg, method, unit, interval); @@ -501,17 +612,27 @@ ioctl_pcs_create(const rocprofiler_agent_t* agent, auto ioctl_ret = ioctl(get_kfd_fd(), AMDKFD_IOC_PC_SAMPLE, &args); *ioctl_pcs_id = args.trace_id; - if(ioctl_ret != 0 && (errno == EBUSY || errno == EEXIST)) + if(ioctl_ret != 0) { - // Currently, KFD uses EBUSY when e.g., PC sampling create is requested from - // withing the ROCgdb. - // On the other hand, EEXIST is used when one tries to create a PC sampling - // with a configuration different than the one already active. - return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; - } - else if(ioctl_ret != 0) - { - return ROCPROFILER_STATUS_ERROR; + if(errno == EBUSY || errno == EEXIST) + { + // Currently, KFD uses EBUSY when e.g., PC sampling create is requested from + // within the ROCgdb. + // On the other hand, EEXIST is used when one tries to create a PC sampling + // with a configuration different than the one already active.
+ return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; + } + else if(errno == EINVAL) + { + // invalid argument (e.g., interval must be power of 2, but a value that's + // not power of 2 is provided) + return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; + } + else + { + // generic error + return ROCPROFILER_STATUS_ERROR; + } } return ROCPROFILER_STATUS_SUCCESS; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter_types.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter_types.hpp index 26dd7241f4..85bb204b6a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter_types.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter_types.hpp @@ -43,7 +43,7 @@ namespace ioctl // 4. ROCPROFILER_IOCTL_STATUS_UNAVAILABLE // We might replace 1, 2, and 4 with rocprofiler_status_t, but still lacking a counterpart // for the ROCPROFILER_IOCTL_STATUS_BUFFER_TOO_SMALL -typedef enum _ROCPROFILER_IOCTL_STATUS +typedef enum rocprofiler_ioctl_status_t { ROCPROFILER_IOCTL_STATUS_SUCCESS = 0, /// Operation successful // USED ROCPROFILER_IOCTL_STATUS_ERROR = 1, /// General error return if not otherwise specified // USED @@ -72,7 +72,7 @@ typedef enum _ROCPROFILER_IOCTL_STATUS ROCPROFILER_IOCTL_STATUS_MEMORY_ALREADY_REGISTERED = 35, /// Memory buffer already registered ROCPROFILER_IOCTL_STATUS_MEMORY_NOT_REGISTERED = 36, /// Memory buffer not registered ROCPROFILER_IOCTL_STATUS_MEMORY_ALIGNMENT = 37, /// Memory parameter not aligned -} ROCPROFILER_IOCTL_STATUS; +} rocprofiler_ioctl_status_t; typedef struct rocprofiler_ioctl_version_info_s { @@ -80,27 +80,29 @@ typedef struct rocprofiler_ioctl_version_info_s uint32_t minor_version; /// supported IOCTL interface minor version } rocprofiler_ioctl_version_info_t; -typedef enum _ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND +typedef enum rocprofiler_ioctl_pc_sampling_method_kind_t { + 
ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_NONE = 0, ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1 = 1, ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1, -} ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND; + ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND_LAST, +} rocprofiler_ioctl_pc_sampling_method_kind_t; -typedef enum _ROCPROFILER_IOCTL_PC_SAMPLING_UNITS +typedef enum rocprofiler_ioctl_pc_sampling_unit_interval_t { ROCPROFILER_IOCTL_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS, ROCPROFILER_IOCTL_PC_SAMPLING_UNIT_INTERVAL_CYCLES, ROCPROFILER_IOCTL_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS, -} ROCPROFILER_IOCTL_PC_SAMPLING_UNIT_INTERVAL; +} rocprofiler_ioctl_pc_sampling_unit_interval_t; typedef struct rocprofiler_ioctl_pc_sampling_info_s { - uint64_t interval; - uint64_t interval_min; - uint64_t interval_max; - uint64_t flags; - ROCPROFILER_IOCTL_PC_SAMPLING_METHOD_KIND method; - ROCPROFILER_IOCTL_PC_SAMPLING_UNIT_INTERVAL units; + uint64_t interval; + uint64_t interval_min; + uint64_t interval_max; + uint64_t flags; + rocprofiler_ioctl_pc_sampling_method_kind_t method; + rocprofiler_ioctl_pc_sampling_unit_interval_t units; } rocprofiler_ioctl_pc_sampling_info_t; } // namespace ioctl diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt index 29eb608a82..6976064bb7 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/CMakeLists.txt @@ -1,7 +1,7 @@ set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES pc_record_interface.cpp) set(ROCPROFILER_LIB_PC_SAMPLING_PARSER_HEADERS correlation.hpp gfx9.hpp gfx11.hpp parser_types.hpp pc_record_interface.hpp rocr.h - stochastic_records.h translation.hpp) + translation.hpp) target_sources( rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_PC_SAMPLING_PARSER_SOURCES} diff --git 
a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp index c866589832..3d23d0d10b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp @@ -241,6 +241,8 @@ add_upcoming_samples(const device_handle device, auto& pc_sample = samples[p]; pc_sample = copySample((const void*) (buffer + p)); + // skip invalid samples + if(pc_sample.size == 0) continue; // Convert PC -> (loaded code object id containing PC, offset within code object) if(!cache_addr_range.inrange(snap->pc)) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx11.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx11.hpp index eae7c60074..fc9504d39a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx11.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx11.hpp @@ -32,7 +32,7 @@ public: TYPE_TEX, TYPE_LDS, TYPE_LDS_DIRECT, - TYPE_EXP, + TYPE_EXPORT, TYPE_MESSAGE, TYPE_BARRIER, TYPE_BRANCH_NOT_TAKEN, @@ -47,15 +47,15 @@ public: enum reason_not_issued { - REASON_NOT_AVAILABLE = 0, - REASON_ALU, + REASON_NO_INSTRUCTION_AVAILABLE = 0, + REASON_ALU_DEPENDENCY, REASON_WAITCNT, - REASON_ARBITER, - REASON_SLEEP, - REASON_BARRIER, + REASON_ARBITER_NOT_WIN, + REASON_SLEEP_WAIT, + REASON_BARRIER_WAIT, REASON_OTHER_WAIT, - REASON_INTERNAL = 31, - REASON_EX_STALL = 31, + REASON_INTERNAL_INSTRUCTION = 31, + REASON_ARBITER_WIN_EX_STALL = 31, }; enum arb_state diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp index 9d7c815395..18cb6e8d08 100644 --- 
a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp @@ -25,6 +25,7 @@ class GFX9 { public: + // matches values specified in perf_snapshot_data register enum inst_type_issued { TYPE_VALU = 0, @@ -33,7 +34,7 @@ public: TYPE_TEX, TYPE_LDS, TYPE_FLAT, - TYPE_EXP, + TYPE_EXPORT, TYPE_MESSAGE, TYPE_BARRIER, TYPE_BRANCH_NOT_TAKEN, @@ -46,30 +47,32 @@ public: TYPE_LDS_DIRECT = 31 }; + // matches values specified in perf_snapshot_data register enum reason_not_issued { - REASON_NOT_AVAILABLE = 0, - REASON_ALU, + REASON_NO_INSTRUCTION_AVAILABLE = 0, + REASON_ALU_DEPENDENCY, REASON_WAITCNT, - REASON_INTERNAL, - REASON_BARRIER, - REASON_ARBITER, - REASON_EX_STALL, + REASON_INTERNAL_INSTRUCTION, + REASON_BARRIER_WAIT, + REASON_ARBITER_NOT_WIN, + REASON_ARBITER_WIN_EX_STALL, REASON_OTHER_WAIT, REASON_LAST, - REASON_SLEEP = 31 + REASON_SLEEP_WAIT = 31 }; + // matches the order of arb_state bits in perf_snapshot_data register enum arb_state { - ISSUE_VALU = 0, - ISSUE_MATRIX, - ISSUE_SCALAR, - ISSUE_VMEM_TEX, - ISSUE_LDS, - ISSUE_FLAT, + ISSUE_MISC = 0, ISSUE_EXP, - ISSUE_MISC, + ISSUE_FLAT, + ISSUE_LDS, + ISSUE_VMEM_TEX, + ISSUE_SCALAR, + ISSUE_MATRIX, + ISSUE_VALU, ISSUE_LAST, ISSUE_LDS_DIRECT = 31, ISSUE_BRMSG = 31, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp index 9195cd402e..28efc28189 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.cpp @@ -22,6 +22,8 @@ #include "lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp" +#include "lib/common/utility.hpp" + template <> uint64_t PCSamplingParserContext::alloc( @@ -127,6 +129,43 @@
PCSamplingParserContext::shouldFlipRocrBuffer(const dispatch_pkt_id_t& pkt) cons return corr_map->checkDispatch(pkt); } +template +inline void +emplace_records_in_buffer(rocprofiler::buffer::instance* buff, + const PcSamplingRecordKindT* samples, + size_t num_samples, + rocprofiler_pc_sampling_record_kind_t record_kind) +{ + for(size_t i = 0; i < num_samples; i++) + buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, record_kind, samples[i]); +} + +template <> +inline void +emplace_records_in_buffer( + rocprofiler::buffer::instance* buff, + const rocprofiler_pc_sampling_record_stochastic_v0_t* samples, + size_t num_samples, + rocprofiler_pc_sampling_record_kind_t record_kind) +{ + for(size_t i = 0; i < num_samples; i++) + { + if(samples[i].size == 0) + { + // `size == 0` internally means invalid sample, so generate it. + auto invalid_sample = rocprofiler::common::init_public_api_struct( + rocprofiler_pc_sampling_record_invalid_t{}); + buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, + ROCPROFILER_PC_SAMPLING_RECORD_INVALID_SAMPLE, + invalid_sample); + } + else + { + buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, record_kind, samples[i]); + } + } +} + template void PCSamplingParserContext::generate_upcoming_pc_record( @@ -141,8 +180,7 @@ PCSamplingParserContext::generate_upcoming_pc_record( if(!buff) throw std::runtime_error(fmt::format("Buffer with id: {} does not exists", buff_id.handle)); - for(size_t i = 0; i < num_samples; i++) - buff->emplace(ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING, record_kind, samples[i]); + emplace_records_in_buffer(buff, samples, num_samples, record_kind); } template <> diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp index d0473a33c6..a124d8fc15 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp +++ 
b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/pc_record_interface.hpp @@ -25,9 +25,9 @@ #include "lib/rocprofiler-sdk/buffer.hpp" #include "lib/rocprofiler-sdk/pc_sampling/parser/correlation.hpp" #include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp" -#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h" #include +#include #include #include diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp index 083016a555..2751013d6a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/gfx9test.cpp @@ -30,75 +30,113 @@ #include #include -#include #include #define GFXIP_MAJOR 9 -#define TYPECHECK(x) \ - snapshots.push_back(rocprofiler_pc_sampling_snapshot_v1_t{.dual_issue_valu = 0, \ - .inst_type = ::PCSAMPLE::x, \ - .reason_not_issued = 0, \ - .arb_state_issue = 0, \ - .arb_state_stall = 0}); -#define UNROLL_TYPECHECK() \ - TYPECHECK(TYPE_VALU); \ - TYPECHECK(TYPE_MATRIX); \ - TYPECHECK(TYPE_SCALAR); \ - TYPECHECK(TYPE_TEX); \ - TYPECHECK(TYPE_LDS); \ - TYPECHECK(TYPE_FLAT); \ - TYPECHECK(TYPE_EXP); \ - TYPECHECK(TYPE_MESSAGE); \ - TYPECHECK(TYPE_BARRIER); \ - TYPECHECK(TYPE_BRANCH_NOT_TAKEN); \ - TYPECHECK(TYPE_BRANCH_TAKEN); \ - TYPECHECK(TYPE_JUMP); \ - TYPECHECK(TYPE_OTHER); \ - TYPECHECK(TYPE_NO_INST); +#define RECORD_INST_TYPE(x) \ + { \ + PcSamplingRecordT sample{}; \ + sample.inst_type = ROCPROFILER_PC_SAMPLING_INSTRUCTION##_##x; \ + snapshots.push_back(sample); \ + } -#define REASONCHECK(x) \ - snapshots.push_back(rocprofiler_pc_sampling_snapshot_v1_t{.dual_issue_valu = 0, \ - .inst_type = 0, \ - .reason_not_issued = ::PCSAMPLE::x, \ - .arb_state_issue = 0, \ - .arb_state_stall = 0}); -#define UNROLL_REASONCHECK(x) \ - 
REASONCHECK(REASON_NOT_AVAILABLE); \ - REASONCHECK(REASON_ALU); \ - REASONCHECK(REASON_WAITCNT); \ - REASONCHECK(REASON_INTERNAL); \ - REASONCHECK(REASON_BARRIER); \ - REASONCHECK(REASON_ARBITER); \ - REASONCHECK(REASON_EX_STALL); \ - REASONCHECK(REASON_OTHER_WAIT); +#define GENERATE_RECORDS_INST_TYPE() \ + RECORD_INST_TYPE(TYPE_VALU); \ + RECORD_INST_TYPE(TYPE_MATRIX); \ + RECORD_INST_TYPE(TYPE_SCALAR); \ + RECORD_INST_TYPE(TYPE_TEX); \ + RECORD_INST_TYPE(TYPE_LDS); \ + RECORD_INST_TYPE(TYPE_FLAT); \ + RECORD_INST_TYPE(TYPE_EXPORT); \ + RECORD_INST_TYPE(TYPE_MESSAGE); \ + RECORD_INST_TYPE(TYPE_BARRIER); \ + RECORD_INST_TYPE(TYPE_BRANCH_NOT_TAKEN); \ + RECORD_INST_TYPE(TYPE_BRANCH_TAKEN); \ + RECORD_INST_TYPE(TYPE_JUMP); \ + RECORD_INST_TYPE(TYPE_OTHER); \ + RECORD_INST_TYPE(TYPE_NO_INST); -#define ARBCHECK1(x, y) \ - snapshots.push_back( \ - rocprofiler_pc_sampling_snapshot_v1_t{.dual_issue_valu = 0, \ - .inst_type = 0, \ - .reason_not_issued = 0, \ - .arb_state_issue = 1 << ::PCSAMPLE::x, \ - .arb_state_stall = 1 << ::PCSAMPLE::y}); -#define ARBCHECK2(x) \ - ARBCHECK1(x, ISSUE_VALU); \ - ARBCHECK1(x, ISSUE_MATRIX); \ - ARBCHECK1(x, ISSUE_SCALAR); \ - ARBCHECK1(x, ISSUE_VMEM_TEX); \ - ARBCHECK1(x, ISSUE_LDS); \ - ARBCHECK1(x, ISSUE_FLAT); \ - ARBCHECK1(x, ISSUE_EXP); \ - ARBCHECK1(x, ISSUE_MISC); +#define RECORD_NOT_ISSUED_REASON(x) \ + { \ + PcSamplingRecordT sample{}; \ + sample.snapshot.reason_not_issued = ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED##_##x; \ + snapshots.push_back(sample); \ + } -#define UNROLL_ARBCHECK() \ - ARBCHECK2(ISSUE_VALU); \ - ARBCHECK2(ISSUE_MATRIX); \ - ARBCHECK2(ISSUE_SCALAR); \ - ARBCHECK2(ISSUE_VMEM_TEX); \ - ARBCHECK2(ISSUE_LDS); \ - ARBCHECK2(ISSUE_FLAT); \ - ARBCHECK2(ISSUE_EXP); \ - ARBCHECK2(ISSUE_MISC); +#define GENERATE_RECORDS_NOT_ISSUED_REASON(x) \ + RECORD_NOT_ISSUED_REASON(REASON_NO_INSTRUCTION_AVAILABLE); \ + RECORD_NOT_ISSUED_REASON(REASON_ALU_DEPENDENCY); \ + RECORD_NOT_ISSUED_REASON(REASON_WAITCNT); \ + 
RECORD_NOT_ISSUED_REASON(REASON_INTERNAL_INSTRUCTION); \ + RECORD_NOT_ISSUED_REASON(REASON_BARRIER_WAIT); \ + RECORD_NOT_ISSUED_REASON(REASON_ARBITER_NOT_WIN); \ + RECORD_NOT_ISSUED_REASON(REASON_ARBITER_WIN_EX_STALL); \ + RECORD_NOT_ISSUED_REASON(REASON_OTHER_WAIT); + +#define RECORD_ARBSTATE_ISSUE_STALL(x, y) \ + { \ + PcSamplingRecordT sample{}; \ + sample.snapshot.arb_state##_##x = 1; \ + sample.snapshot.arb_state##_##y = 1; \ + snapshots.push_back(sample); \ + } + +// Respecting the order of elements in GFX9:arb_state that match the order of arb_state bits +// in perf_snapshot_data register +#define RECORD_ARBSTATE_ISSUE(x) \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_misc); \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_exp); \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_flat); \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_lds); \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_vmem_tex); \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_scalar); \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_matrix); \ + RECORD_ARBSTATE_ISSUE_STALL(x, stall_valu); + +// Respecting the order of elements in GFX9:arb_state that match the order of arb_state bits +// in perf_snapshot_data register +#define GENERATE_RECORDS_ARBSTATE_ISSUE() \ + RECORD_ARBSTATE_ISSUE(issue_misc); \ + RECORD_ARBSTATE_ISSUE(issue_exp); \ + RECORD_ARBSTATE_ISSUE(issue_flat); \ + RECORD_ARBSTATE_ISSUE(issue_lds); \ + RECORD_ARBSTATE_ISSUE(issue_vmem_tex); \ + RECORD_ARBSTATE_ISSUE(issue_scalar); \ + RECORD_ARBSTATE_ISSUE(issue_matrix); \ + RECORD_ARBSTATE_ISSUE(issue_valu); + +#define NON_GFX9_ARBSTATE_IS_ZERO(x, y) \ + EXPECT_EQ(x.snapshot.arb_state_issue_lds_direct, 0); \ + EXPECT_EQ(y.snapshot.arb_state_issue_lds_direct, 0); \ + EXPECT_EQ(x.snapshot.arb_state_issue_brmsg, 0); \ + EXPECT_EQ(y.snapshot.arb_state_issue_brmsg, 0); \ + \ + EXPECT_EQ(x.snapshot.arb_state_stall_lds_direct, 0); \ + EXPECT_EQ(y.snapshot.arb_state_stall_lds_direct, 0); \ + EXPECT_EQ(x.snapshot.arb_state_stall_brmsg, 0); \ + EXPECT_EQ(y.snapshot.arb_state_stall_brmsg, 
0); + +#define MATCH_ARBSTATE(x, y) \ + EXPECT_EQ(x.snapshot.arb_state_issue_valu, y.snapshot.arb_state_issue_valu); \ + EXPECT_EQ(x.snapshot.arb_state_issue_matrix, y.snapshot.arb_state_issue_matrix); \ + EXPECT_EQ(x.snapshot.arb_state_issue_lds, y.snapshot.arb_state_issue_lds); \ + EXPECT_EQ(x.snapshot.arb_state_issue_scalar, y.snapshot.arb_state_issue_scalar); \ + EXPECT_EQ(x.snapshot.arb_state_issue_vmem_tex, y.snapshot.arb_state_issue_vmem_tex); \ + EXPECT_EQ(x.snapshot.arb_state_issue_flat, y.snapshot.arb_state_issue_flat); \ + EXPECT_EQ(x.snapshot.arb_state_issue_exp, y.snapshot.arb_state_issue_exp); \ + EXPECT_EQ(x.snapshot.arb_state_issue_misc, y.snapshot.arb_state_issue_misc); \ + \ + EXPECT_EQ(x.snapshot.arb_state_stall_valu, y.snapshot.arb_state_stall_valu); \ + EXPECT_EQ(x.snapshot.arb_state_stall_matrix, y.snapshot.arb_state_stall_matrix); \ + EXPECT_EQ(x.snapshot.arb_state_stall_lds, y.snapshot.arb_state_stall_lds); \ + EXPECT_EQ(x.snapshot.arb_state_stall_scalar, y.snapshot.arb_state_stall_scalar); \ + EXPECT_EQ(x.snapshot.arb_state_stall_vmem_tex, y.snapshot.arb_state_stall_vmem_tex); \ + EXPECT_EQ(x.snapshot.arb_state_stall_flat, y.snapshot.arb_state_stall_flat); \ + EXPECT_EQ(x.snapshot.arb_state_stall_exp, y.snapshot.arb_state_stall_exp); \ + EXPECT_EQ(x.snapshot.arb_state_stall_misc, y.snapshot.arb_state_stall_misc); \ + \ + NON_GFX9_ARBSTATE_IS_ZERO(x, y) template class WaveSnapTest @@ -134,10 +172,11 @@ public: snap.correlation_id = dispatch->getMockId().raw; snap.perf_snapshot_data = (inst_type << 3) | (reason << 7); + snap.perf_snapshot_data |= 0x1; // sample is valid snap.perf_snapshot_data |= (arb_issue << 10) | (arb_stall << 18); snap.perf_snapshot_data1 = wave_cnt; - assert(dispatch.get()); + EXPECT_NE(dispatch.get(), nullptr); dispatch->submit(packet_union_t{.snap = snap}); }; @@ -156,180 +195,213 @@ public: this->buffer->genUpcomingSamples(max_wave_number); for(size_t i = 0; i < max_wave_number; i++) this->genPCSample( - i, 
GFX9::TYPE_LDS, GFX9::REASON_ALU, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU); + i, GFX9::TYPE_LDS, GFX9::REASON_ALU_DEPENDENCY, GFX9::ISSUE_VALU, GFX9::ISSUE_VALU); } void CheckBuffers() override { auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 - assert(parsed.size() == 1); - assert(parsed[0].size() == max_wave_number); + EXPECT_EQ(parsed.size(), 1); + EXPECT_EQ(parsed[0].size(), max_wave_number); for(size_t i = 0; i < max_wave_number; i++) - assert(parsed[0][i].wave_count == i); + EXPECT_EQ(parsed[0][i].wave_count, i); } const size_t max_wave_number = 64; std::vector snapshots; }; -// class InstTypeTest : public WaveSnapTest -// { -// public: -// void FillBuffers() override -// { -// // Loop over inst_type_issued -// UNROLL_TYPECHECK(); -// buffer->genUpcomingSamples(GFX9::TYPE_LAST); -// for(int i = 0; i < GFX9::TYPE_LAST; i++) -// genPCSample(i, i, GFX9::REASON_ALU, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); -// } +template +class InstTypeTest : public WaveSnapTest +{ +public: + void FillBuffers() override + { + // Loop over inst_type_issued + GENERATE_RECORDS_INST_TYPE(); + this->buffer->genUpcomingSamples(GFX9::TYPE_LAST); + for(int i = 0; i < GFX9::TYPE_LAST; i++) + this->genPCSample( + i, i, GFX9::REASON_ALU_DEPENDENCY, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); + } -// void CheckBuffers() override -// { -// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 -// assert(parsed.size() == 1); -// assert(parsed[0].size() == GFX9::TYPE_LAST); -// assert(snapshots.size() == GFX9::TYPE_LAST); + void CheckBuffers() override + { + auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 + EXPECT_EQ(parsed.size(), 1); + EXPECT_EQ(parsed[0].size(), GFX9::TYPE_LAST); + EXPECT_EQ(snapshots.size(), GFX9::TYPE_LAST); -// for(size_t i = 0; i < GFX9::TYPE_LAST; i++) -// assert(snapshots[i].inst_type == parsed[0][i].snapshot.inst_type); -// } + for(size_t i = 0; i < GFX9::TYPE_LAST; i++) + EXPECT_EQ(snapshots[i].inst_type, parsed[0][i].inst_type); + } -// 
std::vector snapshots; -// }; + std::vector snapshots; +}; -// class StallReasonTest : public WaveSnapTest -// { -// public: -// void FillBuffers() override -// { -// // Loop over reason_not_issued -// UNROLL_REASONCHECK(); -// buffer->genUpcomingSamples(GFX9::REASON_LAST); -// for(int i = 0; i < GFX9::REASON_LAST; i++) -// genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); -// } +template +class StallReasonTest : public WaveSnapTest +{ +public: + void FillBuffers() override + { + // Loop over reason_not_issued + GENERATE_RECORDS_NOT_ISSUED_REASON(); + this->buffer->genUpcomingSamples(GFX9::REASON_LAST); + for(int i = 0; i < GFX9::REASON_LAST; i++) + this->genPCSample(i, GFX9::TYPE_MATRIX, i, GFX9::ISSUE_MATRIX, GFX9::ISSUE_MATRIX); + } -// void CheckBuffers() override -// { -// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 -// assert(parsed.size() == 1); -// assert(parsed[0].size() == GFX9::REASON_LAST); -// assert(snapshots.size() == GFX9::REASON_LAST); + void CheckBuffers() override + { + auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 + EXPECT_EQ(parsed.size(), 1); + EXPECT_EQ(parsed[0].size(), GFX9::REASON_LAST); + EXPECT_EQ(snapshots.size(), GFX9::REASON_LAST); -// for(size_t i = 0; i < GFX9::REASON_LAST; i++) -// assert(snapshots[i].reason_not_issued == parsed[0][i].snapshot.reason_not_issued); -// } + for(size_t i = 0; i < GFX9::REASON_LAST; i++) + EXPECT_EQ(snapshots[i].snapshot.reason_not_issued, + parsed[0][i].snapshot.reason_not_issued); + } -// std::vector snapshots; -// }; + std::vector snapshots; +}; -// class ArbStateTest : public WaveSnapTest -// { -// public: -// void FillBuffers() override -// { -// // Loop over arb_state_issue -// UNROLL_ARBCHECK(); -// buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); -// for(int i = 0; i < GFX9::ISSUE_LAST; i++) -// for(int j = 0; j < GFX9::ISSUE_LAST; j++) -// genPCSample(i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU, 1 << i, 1 << j); -// } +template 
+class ArbStateTest : public WaveSnapTest +{ +public: + void FillBuffers() override + { + // Loop over arb_state_issue + GENERATE_RECORDS_ARBSTATE_ISSUE(); + this->buffer->genUpcomingSamples(GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); + // To match the order of instantiating snapshots inside `GENERATE_RECORDS_ARBSTATE_ISSUE` + // we loop over GFX9:: + for(int i = 0; i < GFX9::ISSUE_LAST; i++) + for(int j = 0; j < GFX9::ISSUE_LAST; j++) + this->genPCSample( + i, GFX9::TYPE_MATRIX, GFX9::REASON_ALU_DEPENDENCY, 1 << i, 1 << j); + } -// void CheckBuffers() override -// { -// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 -// assert(parsed.size() == 1); -// assert(parsed[0].size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); -// assert(snapshots.size() == GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); + void CheckBuffers() override + { + auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 + EXPECT_EQ(parsed.size(), 1); + EXPECT_EQ(parsed[0].size(), GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); + EXPECT_EQ(snapshots.size(), GFX9::ISSUE_LAST * GFX9::ISSUE_LAST); -// for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++) -// { -// auto& snap = snapshots[i]; -// assert(snap.arb_state_issue == parsed[0][i].snapshot.arb_state_issue); -// assert(snap.arb_state_stall == parsed[0][i].snapshot.arb_state_stall); -// } -// } + for(size_t i = 0; i < GFX9::ISSUE_LAST * GFX9::ISSUE_LAST; i++) + { + auto& snap = snapshots[i]; + MATCH_ARBSTATE(snap, parsed[0][i]) + } + } -// std::vector snapshots; -// }; + std::vector snapshots; +}; -// class WaveIssueAndErrorTest : public WaveSnapTest -// { -// void FillBuffers() override -// { -// buffer->genUpcomingSamples(16); -// for(int valid = 0; valid <= 1; valid++) -// for(int issued = 0; issued <= 1; issued++) -// for(int dual = 0; dual <= 1; dual++) -// for(int error = 0; error <= 1; error++) -// genPCSample(valid, issued, dual, error); -// } +template +class WaveIssueAndErrorTest : public WaveSnapTest +{ + struct pc_sampling_test_record_t + 
{ + bool valid; + union + { + PcSamplingRecordT valid_record; + PcSamplingRecordInvalidT invalid_record; + }; + }; -// void CheckBuffers() override -// { -// const int num_combinations = 16; -// auto parsed = buffer->get_parsed_buffer(9); // GFXIP==9 -// assert(parsed.size() == 1); -// assert(parsed[0].size() == num_combinations); -// assert(compare.size() == num_combinations); + void FillBuffers() override + { + this->buffer->genUpcomingSamples(16); + for(int valid = 0; valid <= 1; valid++) + for(int issued = 0; issued <= 1; issued++) + for(int dual = 0; dual <= 1; dual++) + for(int error = 0; error <= 1; error++) + genPCSample(valid, issued, dual, error); + } -// for(size_t i = 0; i < num_combinations; i++) -// { -// assert(compare[i].flags.valid == parsed[0][i].flags.valid); -// assert(compare[i].wave_issued == parsed[0][i].wave_issued); -// assert(compare[i].snapshot.dual_issue_valu == parsed[0][i].snapshot.dual_issue_valu); -// } -// } + void CheckBuffers() override + { + const int num_combinations = 16; + auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 + EXPECT_EQ(parsed.size(), 1); + EXPECT_EQ(parsed[0].size(), num_combinations); + EXPECT_EQ(compare.size(), num_combinations); -// union trap_snapshot_v1 -// { -// struct -// { -// uint32_t valid : 1; -// uint32_t issued : 1; -// uint32_t dual : 1; -// uint32_t reserved : 23; -// uint32_t error : 1; -// uint32_t reserved2 : 5; -// }; -// uint32_t raw; -// }; + for(size_t i = 0; i < num_combinations; i++) + { + if(compare[i].valid) + { + EXPECT_EQ(compare[i].valid_record.wave_issued, parsed[0][i].wave_issued); + EXPECT_EQ(compare[i].valid_record.snapshot.dual_issue_valu, + parsed[0][i].snapshot.dual_issue_valu); + } + else + { + // Internally (inside the parser) invalid samples are represented with + // PcSamplingRecordT of size 0. Eventually, those records are replaced with the + // PcSamplingRecordInvalidT prior to putting inside the SDK buffer. 
+ EXPECT_EQ(parsed[0][i].size, 0); + } + } + } -// void genPCSample(bool valid, bool issued, bool dual, bool error) -// { -// rocprofiler_pc_sampling_record_t sample; -// ::memset(&sample, 0, sizeof(sample)); -// // TODO: Since code objects are not mocked, use pc.code_object_offset -// // as the absolute physical address of the mocked PC. -// sample.pc.code_object_offset = dispatch->unique_id; + union trap_snapshot_v1 + { + struct + { + uint32_t valid : 1; + uint32_t issued : 1; + uint32_t dual : 1; + uint32_t reserved : 23; + uint32_t error : 1; + uint32_t reserved2 : 5; + }; + uint32_t raw; + }; -// sample.correlation_id.internal = dispatch->getMockId().raw; + void genPCSample(bool valid, bool issued, bool dual, bool error) + { + pc_sampling_test_record_t record{}; + record.valid = valid && !error; + if(record.valid) + { + // Fill in the data for the valid record. + auto& sample = record.valid_record; -// sample.flags.valid = valid && !error; -// sample.wave_issued = issued; -// sample.snapshot.dual_issue_valu = dual; + // TODO: Since code objects are not mocked, use pc.code_object_offset + // as the absolute physical address of the mocked PC. 
+ sample.pc.code_object_offset = this->dispatch->unique_id; -// assert(dispatch.get()); + sample.correlation_id.internal = this->dispatch->getMockId().raw; -// compare.push_back(sample); + sample.wave_issued = issued; + sample.snapshot.dual_issue_valu = dual; -// trap_snapshot_v1 snap; -// snap.valid = valid; -// snap.issued = issued; -// snap.dual = dual; -// snap.error = error; + EXPECT_NE(this->dispatch.get(), nullptr); + } -// perf_sample_snapshot_v1 pss; -// pss.perf_snapshot_data = snap.raw; -// pss.correlation_id = dispatch->getMockId().raw; -// dispatch->submit(std::move(pss)); -// }; + compare.push_back(record); -// std::vector compare; -// }; + trap_snapshot_v1 snap; + snap.valid = valid; + snap.issued = issued; + snap.dual = dual; + snap.error = error; + + perf_sample_snapshot_v1 pss; + pss.perf_snapshot_data = snap.raw; + pss.correlation_id = this->dispatch->getMockId().raw; + this->dispatch->submit(std::move(pss)); + }; + + std::vector compare; +}; template class HwIdTest : public WaveSnapTest @@ -405,23 +477,23 @@ class HwIdTest : public WaveSnapTest void CheckBuffers() override { auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 - assert(parsed.size() == 1); - assert(parsed[0].size() == 3); - assert(compare.size() == 3); + EXPECT_EQ(parsed.size(), 1); + EXPECT_EQ(parsed[0].size(), 3); + EXPECT_EQ(compare.size(), 3); for(size_t i = 0; i < 3; i++) { // Comparing individual fields - assert(compare[i].hw_id.wave_id == parsed[0][i].hw_id.wave_id); - assert(compare[i].hw_id.simd_id == parsed[0][i].hw_id.simd_id); - assert(compare[i].hw_id.pipe_id == parsed[0][i].hw_id.pipe_id); - assert(compare[i].hw_id.cu_or_wgp_id == parsed[0][i].hw_id.cu_or_wgp_id); - assert(compare[i].hw_id.shader_array_id == parsed[0][i].hw_id.shader_array_id); - assert(compare[i].hw_id.shader_engine_id == parsed[0][i].hw_id.shader_engine_id); - assert(compare[i].hw_id.workgroup_id == parsed[0][i].hw_id.workgroup_id); - assert(compare[i].hw_id.vm_id == 
parsed[0][i].hw_id.vm_id); - assert(compare[i].hw_id.queue_id == parsed[0][i].hw_id.queue_id); - assert(compare[i].hw_id.microengine_id == parsed[0][i].hw_id.microengine_id); + EXPECT_EQ(compare[i].hw_id.wave_id, parsed[0][i].hw_id.wave_id); + EXPECT_EQ(compare[i].hw_id.simd_id, parsed[0][i].hw_id.simd_id); + EXPECT_EQ(compare[i].hw_id.pipe_id, parsed[0][i].hw_id.pipe_id); + EXPECT_EQ(compare[i].hw_id.cu_or_wgp_id, parsed[0][i].hw_id.cu_or_wgp_id); + EXPECT_EQ(compare[i].hw_id.shader_array_id, parsed[0][i].hw_id.shader_array_id); + EXPECT_EQ(compare[i].hw_id.shader_engine_id, parsed[0][i].hw_id.shader_engine_id); + EXPECT_EQ(compare[i].hw_id.workgroup_id, parsed[0][i].hw_id.workgroup_id); + EXPECT_EQ(compare[i].hw_id.vm_id, parsed[0][i].hw_id.vm_id); + EXPECT_EQ(compare[i].hw_id.queue_id, parsed[0][i].hw_id.queue_id); + EXPECT_EQ(compare[i].hw_id.microengine_id, parsed[0][i].hw_id.microengine_id); } } @@ -451,8 +523,9 @@ class HwIdTest : public WaveSnapTest // raw register value snap.hw_id = hw_id.raw; snap.correlation_id = this->dispatch->getMockId().raw; + snap.perf_snapshot_data |= 0x1; // sample is valid - assert(this->dispatch.get()); + EXPECT_NE(this->dispatch.get(), nullptr); this->dispatch->submit(snap); }; @@ -473,26 +546,26 @@ class WaveOtherFieldsTest : public WaveSnapTest void CheckBuffers() override { auto parsed = this->buffer->get_parsed_buffer(9); // GFXIP==9 - assert(parsed.size() == 1); - assert(parsed[0].size() == 3); - assert(compare.size() == 3); + EXPECT_EQ(parsed.size(), 1); + EXPECT_EQ(parsed[0].size(), 3); + EXPECT_EQ(compare.size(), 3); for(size_t i = 0; i < 3; i++) { // TODO: if we decide to test flags, make specialization for // rocprofiler_pc_sampling_record_stochastic_v0_t - // assert(parsed[0][i].flags.has_stall_reason == true); - // assert(parsed[0][i].flags.has_wave_cnt == true); - // assert(parsed[0][i].flags.reserved == false); + // EXPECT_EQ(parsed[0][i].flags.has_stall_reason, true); + // 
EXPECT_EQ(parsed[0][i].flags.has_wave_cnt, true); + // EXPECT_EQ(parsed[0][i].flags.reserved, false); - assert(compare[i].exec_mask == parsed[0][i].exec_mask); - assert(compare[i].workgroup_id == parsed[0][i].workgroup_id); + EXPECT_EQ(compare[i].exec_mask, parsed[0][i].exec_mask); + EXPECT_EQ(compare[i].workgroup_id, parsed[0][i].workgroup_id); - assert(compare[i].hw_id.chiplet == parsed[0][i].hw_id.chiplet); - assert(compare[i].wave_in_group == parsed[0][i].wave_in_group); + EXPECT_EQ(compare[i].hw_id.chiplet, parsed[0][i].hw_id.chiplet); + EXPECT_EQ(compare[i].wave_in_group, parsed[0][i].wave_in_group); // TODO: handle HW_ID as well. - // assert(compare[i].hw_id == parsed[0][i].hw_id); - assert(compare[i].correlation_id.internal == parsed[0][i].correlation_id.internal); + // EXPECT_EQ(compare[i].hw_id, parsed[0][i].hw_id); + EXPECT_EQ(compare[i].correlation_id.internal, parsed[0][i].correlation_id.internal); } } @@ -525,7 +598,11 @@ class WaveOtherFieldsTest : public WaveSnapTest snap.chiplet_and_wave_id = (chip << 8) | (wave & 0x3F); snap.correlation_id = this->dispatch->getMockId().raw; - assert(this->dispatch.get()); + // to ensure all stochastic samples are generated properly, + // marked them as valid + snap.perf_snapshot_data |= 0x1; // set the bit indicating the sample is valid + + EXPECT_NE(this->dispatch.get(), nullptr); this->dispatch->submit(snap); (void) pc; @@ -538,10 +615,12 @@ TEST(pcs_parser, gfx9_test) { // Tests specific to stochastic sampling only WaveCntTest{}.Test(); - // InstTypeTest{}.Test(); - // StallReasonTest{}.Test(); - // ArbStateTest{}.Test(); - // WaveIssueAndErrorTest{}.Test(); + InstTypeTest{}.Test(); + StallReasonTest{}.Test(); + ArbStateTest{}.Test(); + WaveIssueAndErrorTest{} + .Test(); // Tests commong for both host trap and stochastic sampling. 
HwIdTest{}.Test(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp index 36558b7915..ed8d10a085 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/mocks.hpp @@ -309,6 +309,8 @@ public: ::memset(&uni, 0, sizeof(uni)); uni.snap.pc = dispatch->unique_id; uni.snap.correlation_id = dispatch->getMockId().raw; + // mark sample valid in case of stochastic sampling tests + uni.snap.perf_snapshot_data |= 0x1; // stochastic sample is valid dispatch->submit(uni); }; void print() diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp index c546973986..d839d68fed 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/tests/multigpu.cpp @@ -250,6 +250,7 @@ multithread_codeobj(size_t tid, Latch* latch) for(int s = 0; s < NUM_SAMPLES; s++) { uni.snap.pc = pc_base_addr + s; + uni.snap.perf_snapshot_data |= 0x1; // sample is valid dispatch->submit(uni); } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp index 7e74575a8b..c5bfd162ae 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/parser/translation.hpp @@ -30,7 +30,8 @@ #include "lib/rocprofiler-sdk/pc_sampling/parser/gfx9.hpp" #include "lib/rocprofiler-sdk/pc_sampling/parser/parser_types.hpp" #include "lib/rocprofiler-sdk/pc_sampling/parser/rocr.h" 
-#include "lib/rocprofiler-sdk/pc_sampling/parser/stochastic_records.h" + +#include // TODO: refactor the commented code for stochastic sampling @@ -51,7 +52,6 @@ // ret.wave_count = sample.perf_snapshot_data1 & 0x3F; -// ret.wave_issued = sample.perf_snapshot_data >> 1; // ret.snapshot.dual_issue_valu = sample.perf_snapshot_data >> 2; // ret.snapshot.inst_type = sample.perf_snapshot_data >> 3; // ret.snapshot.reason_not_issued = (sample.perf_snapshot_data >> 7) & 0x7; @@ -103,90 +103,76 @@ // #undef BITSHIFT -// #define LUTOVERLOAD(sname) this->operator[](GFX::sname) = PCSAMPLE::sname +#define LUTOVERLOAD(sname, rocp_prefix) this->operator[](GFX::sname) = rocp_prefix##_##sname +#define LUTOVERLOAD_INST(sname) LUTOVERLOAD(sname, ROCPROFILER_PC_SAMPLING_INSTRUCTION) +#define LUTOVERLOAD_INST_NOT_ISSUED(sname) \ + LUTOVERLOAD(sname, ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED) -// template -// class GFX_REASON_LUT : public std::array -// { -// public: -// GFX_REASON_LUT() -// { -// std::memset(data(), 0, size() * sizeof(int)); -// LUTOVERLOAD(REASON_NOT_AVAILABLE); -// LUTOVERLOAD(REASON_ALU); -// LUTOVERLOAD(REASON_WAITCNT); -// LUTOVERLOAD(REASON_INTERNAL); -// LUTOVERLOAD(REASON_BARRIER); -// LUTOVERLOAD(REASON_ARBITER); -// LUTOVERLOAD(REASON_EX_STALL); -// LUTOVERLOAD(REASON_OTHER_WAIT); -// LUTOVERLOAD(REASON_SLEEP); -// } -// }; +template +struct gfx_inst_lut : public std::array +{ + gfx_inst_lut() + { + std::memset(data(), 0, size() * sizeof(int)); + LUTOVERLOAD_INST(TYPE_VALU); + LUTOVERLOAD_INST(TYPE_MATRIX); + LUTOVERLOAD_INST(TYPE_SCALAR); + LUTOVERLOAD_INST(TYPE_TEX); + LUTOVERLOAD_INST(TYPE_LDS); + LUTOVERLOAD_INST(TYPE_LDS_DIRECT); + LUTOVERLOAD_INST(TYPE_FLAT); + LUTOVERLOAD_INST(TYPE_EXPORT); + LUTOVERLOAD_INST(TYPE_MESSAGE); + LUTOVERLOAD_INST(TYPE_BARRIER); + LUTOVERLOAD_INST(TYPE_BRANCH_NOT_TAKEN); + LUTOVERLOAD_INST(TYPE_BRANCH_TAKEN); + LUTOVERLOAD_INST(TYPE_JUMP); + LUTOVERLOAD_INST(TYPE_OTHER); + LUTOVERLOAD_INST(TYPE_NO_INST); + 
LUTOVERLOAD_INST(TYPE_DUAL_VALU); + } +}; -// template -// class GFX_INST_LUT : public std::array -// { -// public: -// GFX_INST_LUT() -// { -// std::memset(data(), 0, size() * sizeof(int)); -// LUTOVERLOAD(TYPE_VALU); -// LUTOVERLOAD(TYPE_MATRIX); -// LUTOVERLOAD(TYPE_SCALAR); -// LUTOVERLOAD(TYPE_TEX); -// LUTOVERLOAD(TYPE_LDS); -// LUTOVERLOAD(TYPE_LDS_DIRECT); -// LUTOVERLOAD(TYPE_FLAT); -// LUTOVERLOAD(TYPE_EXP); -// LUTOVERLOAD(TYPE_MESSAGE); -// LUTOVERLOAD(TYPE_BARRIER); -// LUTOVERLOAD(TYPE_BRANCH_NOT_TAKEN); -// LUTOVERLOAD(TYPE_BRANCH_TAKEN); -// LUTOVERLOAD(TYPE_JUMP); -// LUTOVERLOAD(TYPE_OTHER); -// LUTOVERLOAD(TYPE_NO_INST); -// LUTOVERLOAD(TYPE_DUAL_VALU); -// } -// }; +template +struct gfx_reason_lut : public std::array +{ + gfx_reason_lut() + { + std::memset(data(), 0, size() * sizeof(int)); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_NO_INSTRUCTION_AVAILABLE); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_ALU_DEPENDENCY); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_WAITCNT); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_INTERNAL_INSTRUCTION); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_BARRIER_WAIT); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_ARBITER_NOT_WIN); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_ARBITER_WIN_EX_STALL); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_OTHER_WAIT); + LUTOVERLOAD_INST_NOT_ISSUED(REASON_SLEEP_WAIT); + } +}; -// template -// inline int -// translate_reason(int in) -// { -// static GFX_REASON_LUT lut; -// return lut[in & 0x1F]; -// } +template +inline int +translate_inst(int in) +{ + static gfx_inst_lut lut; + return lut[in & 0x1F]; +} -// template -// inline int -// translate_inst(int in) -// { -// static GFX_INST_LUT lut; -// return lut[in & 0x1F]; -// } +template +inline int +translate_reason(int in) +{ + static gfx_reason_lut lut; + return lut[in & 0x1F]; +} -// #undef LUTOVERLOAD - -// template -// inline rocprofiler_pc_sampling_record_t -// copySample(const void* sample) -// { -// if(HostTrap) return copyHostTrapSample(*(const perf_sample_host_trap_v1*) sample); - -// 
rocprofiler_pc_sampling_record_t ret = -// copyStochasticSample(*(const perf_sample_snapshot_v1*) sample); - -// ret.snapshot.inst_type = translate_inst(ret.snapshot.inst_type); -// ret.snapshot.arb_state_issue = translate_arb(ret.snapshot.arb_state_issue); -// ret.snapshot.arb_state_stall = translate_arb(ret.snapshot.arb_state_stall); -// ret.snapshot.reason_not_issued = translate_reason(ret.snapshot.reason_not_issued); - -// return ret; -// } +#undef LUTOVERLOAD_INST_NOT_ISSUED +#undef LUTOVERLOAD_INST +#undef LUTOVERLOAD #define EXTRACT_BITS(val, bit_end, bit_start) \ - (val >> bit_start) & ((1U << (bit_end - bit_start + 1)) - 1) + ((val >> bit_start) & ((1U << (bit_end - bit_start + 1)) - 1)) template inline void @@ -228,8 +214,6 @@ copyHwId(rocprofiler_pc_sampling_hw_id hw_id.microengine_id = EXTRACT_BITS(hw_id_reg, 31, 30); } -#undef EXTRACT_BITS - template inline PcSamplingRecordT copySampleHeader(const SType& sample) @@ -276,11 +260,65 @@ inline rocprofiler_pc_sampling_record_stochastic_v0_t copySample(const void* sample) { const auto& sample_ = *static_cast(sample); - auto ret = copySampleHeader(sample_); + + // Extracting data from the perf_snapshot_data register + auto perf_snapshot_data = sample_.perf_snapshot_data; + // The sample is valid iff perf_snapshot_data.valid (bit 0) is set and + // perf_snapshot_data.error (bit 26) is clear + auto valid = static_cast(EXTRACT_BITS(perf_snapshot_data, 0, 0) & + ~EXTRACT_BITS(perf_snapshot_data, 26, 26)); + if(!valid) + { + // To reduce refactoring of the PC sampling parser, we agreed to internally represent + // invalid samples with `rocprofiler_pc_sampling_record_stochastic_v0_t` with size 0. + // Eventually, those records are replaced with rocprofiler_pc_sampling_record_invalid_t + // and placed into the SDK buffer consumed by the end tool.
+ rocprofiler_pc_sampling_record_stochastic_v0_t invalid{}; + invalid.size = 0; + // No need to further process invalid samples + return invalid; + } + + auto ret = copySampleHeader(sample_); copyChipletId(ret, sample_); copyHwId(ret.hw_id, sample_.hw_id); - ret.wave_count = sample_.perf_snapshot_data1 & 0x3F; - // TODO: implement logic for manipulating stochastic related fields + + // no memory counters on GFX9 + ret.flags.has_memory_counter = false; + + // wave issued an instruction + ret.wave_issued = EXTRACT_BITS(perf_snapshot_data, 1, 1); + // type of issued instruction, valid only if `ret.wave_issued` is true. + ret.inst_type = translate_inst(EXTRACT_BITS(perf_snapshot_data, 6, 3)); + // two VALU instructions issued in this cycle + ret.snapshot.dual_issue_valu = EXTRACT_BITS(perf_snapshot_data, 2, 2); + // reason for not issuing an instruction, valid only if `ret.wave_issued` is false + ret.snapshot.reason_not_issued = translate_reason(EXTRACT_BITS(perf_snapshot_data, 9, 7)); + + // arbiter state information + uint16_t arb_state = EXTRACT_BITS(perf_snapshot_data, 25, 10); + ret.snapshot.arb_state_issue_valu = EXTRACT_BITS(arb_state, 7, 7); + ret.snapshot.arb_state_issue_matrix = EXTRACT_BITS(arb_state, 6, 6); + ret.snapshot.arb_state_issue_lds = EXTRACT_BITS(arb_state, 3, 3); + ret.snapshot.arb_state_issue_scalar = EXTRACT_BITS(arb_state, 5, 5); + ret.snapshot.arb_state_issue_vmem_tex = EXTRACT_BITS(arb_state, 4, 4); + ret.snapshot.arb_state_issue_flat = EXTRACT_BITS(arb_state, 2, 2); + ret.snapshot.arb_state_issue_exp = EXTRACT_BITS(arb_state, 1, 1); + ret.snapshot.arb_state_issue_misc = EXTRACT_BITS(arb_state, 0, 0); + + ret.snapshot.arb_state_stall_valu = EXTRACT_BITS(arb_state, 15, 15); + ret.snapshot.arb_state_stall_matrix = EXTRACT_BITS(arb_state, 14, 14); + ret.snapshot.arb_state_stall_lds = EXTRACT_BITS(arb_state, 11, 11); + ret.snapshot.arb_state_stall_scalar = EXTRACT_BITS(arb_state, 13, 13); + ret.snapshot.arb_state_stall_vmem_tex =
EXTRACT_BITS(arb_state, 12, 12); + ret.snapshot.arb_state_stall_flat = EXTRACT_BITS(arb_state, 10, 10); + ret.snapshot.arb_state_stall_exp = EXTRACT_BITS(arb_state, 9, 9); + ret.snapshot.arb_state_stall_misc = EXTRACT_BITS(arb_state, 8, 8); + + // Extracting data from the perf_snapshot_data1 register + // Active waves on CU at the moment of sampling + ret.wave_count = EXTRACT_BITS(sample_.perf_snapshot_data1, 5, 0); + return ret; } @@ -309,3 +347,5 @@ copySample(const void* sa // ret.wave_count = sample_.perf_snapshot_data1 & 0x3F; return ret; } + +#undef EXTRACT_BITS diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.cpp index 104f4a0e70..ae18b27c00 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.cpp @@ -263,6 +263,7 @@ flush_internal_agent_buffers(rocprofiler_buffer_id_t buffer_id) auto* service = get_configured_pc_sampling_service().load(); if(service && ctx->pc_sampler.get() == service) { + rocprofiler_status_t status = ROCPROFILER_STATUS_SUCCESS; // The context `ctx` (that holds the buffer with `buffer_id`) // is the one containing PC sampling service. // The HSA interception table is registered. @@ -272,7 +273,10 @@ flush_internal_agent_buffers(rocprofiler_buffer_id_t buffer_id) if(agent_session->buffer_id.handle == buffer_id.handle) { // Flush internal PC sampling buffers filled by the agent - return hsa::flush_internal_agent_buffers(agent_session.get()); + // NOTE: one rocprofiler-SDK PC sampling buffer can be tied + // to multiple agents (agent sessions).
+ status = hsa::flush_internal_agent_buffers(agent_session.get()); + if(status != ROCPROFILER_STATUS_SUCCESS) return status; } } } @@ -281,6 +285,41 @@ flush_internal_agent_buffers(rocprofiler_buffer_id_t buffer_id) return ROCPROFILER_STATUS_SUCCESS; } +rocprofiler_status_t +flush_all_agent_buffers() +{ + auto* service = get_configured_pc_sampling_service().load(); + if(!service) return ROCPROFILER_STATUS_ERROR; + + rocprofiler_status_t status = ROCPROFILER_STATUS_SUCCESS; + // Loop over all agents that have PC sampling service configured + // and drain their internal buffers. + // NOTE: one SDK buffer can consume data from multiple agents + // (multiple HSA runtime buffers) + for(const auto& [_, agent_session] : service->agent_sessions) + { + status = flush_internal_agent_buffers(agent_session->buffer_id); + if(status != ROCPROFILER_STATUS_SUCCESS) + { + ROCP_ERROR << "Failed to flush internal HSA buffers tied to rocp buffer " + << agent_session->buffer_id.handle; + } + } + return status; +} + +void +service_sync() +{ + flush_all_agent_buffers(); +} + +void +service_fini() +{ + flush_all_agent_buffers(); +} + } // namespace pc_sampling } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.hpp index a03d344b15..0edd0a2ec5 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/service.hpp @@ -67,6 +67,12 @@ is_pc_sample_service_configured(rocprofiler_agent_id_t agent_id); rocprofiler_status_t flush_internal_agent_buffers(rocprofiler_buffer_id_t buffer_id); + +void +service_sync(); + +void +service_fini(); } // namespace pc_sampling } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp 
b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp index dbd7302a06..927b4a740b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/pc_sampling/tests/configure_service.cpp @@ -138,8 +138,9 @@ find_all_gpu_agents_supporting_pc_sampling_impl(rocprofiler_agent_version_t vers return ROCPROFILER_STATUS_SUCCESS; } -const rocprofiler_pc_sampling_configuration_t -extract_pc_sampling_config_prefer_stochastic(rocprofiler_agent_id_t agent_id) +rocprofiler_pc_sampling_configuration_t +extract_pc_sampling_config_prefer(rocprofiler_pc_sampling_method_t method, + rocprofiler_agent_id_t agent_id) { auto cb = [](const rocprofiler_pc_sampling_configuration_t* configs, size_t num_config, @@ -158,31 +159,46 @@ extract_pc_sampling_config_prefer_stochastic(rocprofiler_agent_id_t agent_id) ROCPROFILER_CALL(rocprofiler_query_pc_sampling_agent_configurations(agent_id, cb, &configs), "Failed to query available configurations"); - const rocprofiler_pc_sampling_configuration_t* first_host_trap_config = nullptr; - const rocprofiler_pc_sampling_configuration_t* first_stochastic_config = nullptr; - // Search until encountering on the stochastic configuration, if any. - // Otherwise, use the host trap config + const rocprofiler_pc_sampling_configuration_t* first_preferred_method_config = nullptr; + const rocprofiler_pc_sampling_configuration_t* first_remained_method_config = nullptr; + // Search until encountering the prefered method configuration, if any. + // Otherwise, use what remained. for(auto const& cfg : configs) { - if(cfg.method == ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC) + if(cfg.method == method) { - // Temporarily disable stochastic sampling as it's not fully supported. 
- // first_stochastic_config = &cfg; - // break; + first_preferred_method_config = &cfg; + break; } - else if(!first_host_trap_config && cfg.method == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) + else if(!first_remained_method_config && + cfg.method != ROCPROFILER_PC_SAMPLING_METHOD_NONE && + cfg.method != ROCPROFILER_PC_SAMPLING_METHOD_LAST) { - first_host_trap_config = &cfg; + first_remained_method_config = &cfg; } } - // Check if the stochastic config is found. Use host trap config otherwise. + // Check if the config with the preferred method is found. Use config with other method + // otherwise. const rocprofiler_pc_sampling_configuration_t* picked_cfg = - (first_stochastic_config != nullptr) ? first_stochastic_config : first_host_trap_config; + (first_preferred_method_config != nullptr) ? first_preferred_method_config + : first_remained_method_config; return *picked_cfg; } +rocprofiler_pc_sampling_configuration_t +extract_pc_sampling_config_prefer_stochastic(rocprofiler_agent_id_t agent_id) +{ + return extract_pc_sampling_config_prefer(ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC, agent_id); +} + +rocprofiler_pc_sampling_configuration_t +extract_pc_sampling_config_prefer_host_trap(rocprofiler_agent_id_t agent_id) +{ + return extract_pc_sampling_config_prefer(ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP, agent_id); +} + void rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, rocprofiler_buffer_id_t /*buffer_id*/, @@ -306,6 +322,41 @@ test_fail_because_service_is_already_configured( ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED); } +/** + * @brief Current limitation - Stochastic and Host-Trap PC sampling cannot coexist + * on the same device simultaneously. + */ +void +test_fail_stochastic_vs_host_trap(const callback_data* cb_data, + rocprofiler_agent_id_t agent_id, + const rocprofiler_pc_sampling_configuration_t* picked_pcs_config) +{ + // Ensure that stochastic sampling has been configured on the device. 
+ if(picked_pcs_config->method == ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC) + { + // KFD is implemented in the way that if stochastic is configured, + // no host-trap configuration will be returned (and vice-versa). + // Thus, ensure that the following function, although prefers host-trap, + // returns stochastic. + auto still_stochastic_config = extract_pc_sampling_config_prefer_host_trap(agent_id); + EXPECT_EQ(still_stochastic_config.method, ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC); + + constexpr uint64_t host_trap_interva_us = 1; + // Now, ensure that a user cannot still force rocprofiler-sdk and configure host-trap + // sampling on the device with configured stochastic sampling. + // ensure that stochastic and host trap sampling cannot coexist on the same device. + EXPECT_EQ( + rocprofiler_configure_pc_sampling_service(cb_data->client_ctx, + agent_id, + ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP, + ROCPROFILER_PC_SAMPLING_UNIT_TIME, + host_trap_interva_us, + cb_data->client_buffer, + 0), + ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED); + } +} + } // namespace TEST(pc_sampling, rocprofiler_configure_pc_sampling_service) @@ -388,6 +439,7 @@ TEST(pc_sampling, rocprofiler_configure_pc_sampling_service) "Failed to configure PC sampling service"); test_fail_because_service_is_already_configured(cb_data, agent_id, &pcs_config); + test_fail_stochastic_vs_host_trap(cb_data, agent_id, &pcs_config); // Cannot create PC sampling service in context different than the `cb_data->client_ctx` EXPECT_EQ(rocprofiler_configure_pc_sampling_service(another_ctx, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index 54c04ef894..1027ea3fe4 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -601,6 +601,7 @@ invoke_client_finalizer(rocprofiler_client_id_t client_id) 
hsa::async_copy_sync(); hsa::queue_controller_sync(); + pc_sampling::service_sync(); auto _fini_status = get_fini_status(); if(_fini_status == 0) set_fini_status(-1); @@ -726,6 +727,8 @@ finalize() #if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0 // WARNING: this must precede `code_object::finalize()` pc_sampling::code_object::finalize(); + // WARNING: this must follows queue_controller_fini. + pc_sampling::service_fini(); #endif code_object::finalize(); context::correlation_id_finalize(); diff --git a/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.cpp b/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.cpp index efa1867c74..8c2709bf92 100644 --- a/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.cpp +++ b/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.cpp @@ -44,14 +44,13 @@ namespace { struct FlatProfiler { -public: FlatProfiler() = default; ~FlatProfiler() = default; - CodeobjAddressTranslate translator; - KernelObjectMap kernel_object_map; - FlatProfile flat_profile; - std::mutex global_mut; + CodeobjAddressTranslate translator = {}; + KernelObjectMap kernel_object_map = {}; + FlatProfile flat_profile = {}; + std::mutex global_mut = {}; }; } // namespace @@ -68,6 +67,7 @@ void fini() { delete flat_profiler; + flat_profiler = nullptr; } CodeobjAddressTranslate& @@ -186,15 +186,19 @@ dump_flat_profile() ss << "====================================\n" << std::endl; }); - ss << "The total number of decoded samples: " << samples_num << std::endl; - ss << "The total number of collected samples: " << client::pcs::total_samples_num() + ss << "The total number of valid decoded samples: " + << flat_profile.get_valid_decoded_samples_num() << std::endl; + ss << "The total number of invalid samples : " << flat_profile.get_invalid_samples_num() << std::endl; *utils::get_output_stream() << ss.str() << std::endl; - assert(samples_num == client::pcs::total_samples_num()); - // We expect at least one PC sample to be 
decoded/delivered; - assert(samples_num > 0); + utils::pcs_assert( + samples_num == flat_profile.get_valid_decoded_samples_num(), + "Number of collected valid samples different than the number of decoded samples."); + utils::pcs_assert(samples_num > 0, "No valid samples collected/decoded."); + utils::pcs_assert(flat_profile.more_valid_decoded_samples_expected(), + "More invalid samples observed."); } } // namespace address_translation diff --git a/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.hpp b/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.hpp index d352d64664..1895b26081 100644 --- a/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.hpp +++ b/projects/rocprofiler-sdk/tests/pc_sampling/address_translation.hpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -47,8 +48,8 @@ using marker_id_t = rocprofiler::sdk::codeobj::disassembly::marker_i */ struct inst_id_t { - marker_id_t code_object_id; - uint64_t pc_addr; + marker_id_t code_object_id = 0; + uint64_t pc_addr = 0; bool operator==(const inst_id_t& b) const { @@ -97,12 +98,12 @@ public: uint64_t end_address() const { return end_address_; }; private: - mutable std::shared_mutex mut; - uint64_t code_object_id_; - std::string kernel_name_; - uint64_t begin_address_; - uint64_t end_address_; - std::vector> instructions_; + mutable std::shared_mutex mut = {}; + uint64_t code_object_id_ = 0; + std::string kernel_name_ = {}; + uint64_t begin_address_ = 0; + uint64_t end_address_ = 0; + std::vector> instructions_ = {}; }; class KernelObjectMap @@ -156,8 +157,8 @@ public: } private: - std::unordered_map> kernel_object_map; - mutable std::shared_mutex mut; + std::unordered_map> kernel_object_map = {}; + mutable std::shared_mutex mut = {}; std::string form_key(uint64_t code_object_id, std::string kernel_name, uint64_t begin_address) { @@ -206,14 +207,14 @@ public: uint64_t sample_count() const { return sample_count_; }; private: - mutable 
std::shared_mutex mut; + mutable std::shared_mutex mut = {}; // FIXME: prevent direct access of the following fields. // The following fields should be accessible only from within `process` function. - std::unique_ptr inst_; + std::unique_ptr inst_ = {}; // In case an instruction is samples with different exec masks, // keep track of how many time each exec_mask was observed. - std::map exec_mask_counts_; + std::map exec_mask_counts_ = {}; // How many time this instruction is samples uint64_t sample_count_ = 0; }; @@ -226,6 +227,8 @@ public: // write lock required void add_sample(std::unique_ptr instruction, uint64_t exec_mask) { + // counting valid decoded samples + valid_decoded_samples_num++; auto lock = std::unique_lock{mut}; inst_id_t inst_id = {.code_object_id = instruction->codeobj_id, @@ -256,10 +259,30 @@ public: return nullptr; } + void add_invalid_sample() + { + // counting invalid samples + invalid_decoded_samples_num++; + } + + /** + * @brief Verify that more valid decoded samples is generated. 
+ */ + bool more_valid_decoded_samples_expected() const + { + return valid_decoded_samples_num > invalid_decoded_samples_num; + } + + uint64_t get_valid_decoded_samples_num() const { return valid_decoded_samples_num; } + + uint64_t get_invalid_samples_num() const { return invalid_decoded_samples_num; } + private: // TODO: optimize to use unordered_map - std::map> samples; - mutable std::shared_mutex mut; + std::map> samples = {}; + std::atomic valid_decoded_samples_num = {}; + std::atomic invalid_decoded_samples_num = {}; + mutable std::shared_mutex mut = {}; }; std::mutex& diff --git a/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp b/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp index 012586db89..c65315c9a8 100644 --- a/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp +++ b/projects/rocprofiler-sdk/tests/pc_sampling/pcs.cpp @@ -53,6 +53,11 @@ using avail_configs_vec_t = std::vector>; using pc_sampling_buffer_id_vec_t = std::vector; +namespace +{ +constexpr uint64_t stochastic_interval = 1048576; // 2 ^ 20 cycles +} // namespace + struct tool_agent_info { rocprofiler_agent_id_t agent_id; @@ -79,16 +84,14 @@ public: } // GPU agents supporting PC sampling - tool_agent_info_vec_t gpu_agents; - // The total number of collected samples - std::atomic total_samples_num{0}; + tool_agent_info_vec_t gpu_agents = {}; // ROCProfiler-SDK PC sampling buffers - pc_sampling_buffer_id_vec_t buffer_ids; + pc_sampling_buffer_id_vec_t buffer_ids = {}; // The set that keeps track of reported code object loading/unloading events. // At the end of the test, the sets needs to be empty. // Namely, each loading event will insert a code object id into the set, // while each unloading event will delete a code ojbect id from the set. - code_object_id_set_t active_code_objects; + code_object_id_set_t active_code_objects = {}; }; // The reason for using raw pointers is the following. 
@@ -139,7 +142,7 @@ find_all_gpu_agents_supporting_pc_sampling_impl(rocprofiler_agent_version_t vers << "type=" << _agents[i]->type << "\n"; } - *utils::get_output_stream() << ss.str() << std::endl; + *utils::get_output_stream() << ss.str() << "\n"; return ROCPROFILER_STATUS_SUCCESS; } @@ -188,8 +191,8 @@ query_avail_configs_for_agent(tool_agent_info* agent_info) { // The query operation failed, so consider the PC sampling is unsupported at the agent. // This can happen if the PC sampling service is invoked within the ROCgdb. - ss << "Querying PC sampling capabilities failed with status: " << status << std::endl; - *utils::get_output_stream() << ss.str() << std::endl; + ss << "Querying PC sampling capabilities failed with status: " << status << "\n"; + *utils::get_output_stream() << ss.str() << "\n"; return false; } else if(agent_info->avail_configs->size() == 0) @@ -199,7 +202,8 @@ query_avail_configs_for_agent(tool_agent_info* agent_info) } ss << "The agent with the id: " << agent_info->agent_id.handle << " supports the " - << agent_info->avail_configs->size() << " configurations: " << std::endl; + << agent_info->avail_configs->size() << " configurations: " + << "\n"; size_t ind = 0; for(auto& cfg : *agent_info->avail_configs) { @@ -208,7 +212,11 @@ query_avail_configs_for_agent(tool_agent_info* agent_info) << "unit: " << cfg.unit << ", " << "min_interval: " << cfg.min_interval << ", " << "max_interval: " << cfg.max_interval << ", " - << "flags: " << std::hex << cfg.flags << std::dec << std::endl; + << "flags: " << std::hex << cfg.flags << std::dec + << ((cfg.flags == ROCPROFILER_PC_SAMPLING_CONFIGURATION_FLAGS_INTERVAL_POW2) + ? 
" (an interval value must be power of 2)" + : "") + << "\n"; } *utils::get_output_stream() << ss.str() << std::flush; @@ -221,8 +229,9 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, rocprofiler_context_id_t context_id, rocprofiler_buffer_id_t buffer_id) { - int failures = MAX_FAILURES; - size_t interval = 0; + auto stochastic_picked = false; + int failures = MAX_FAILURES; + size_t interval = 0; do { // Update the list of available configurations @@ -245,9 +254,9 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, { if(cfg.method == ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC) { - // Temporarily disable stochastic sampling as it's not fully supported. - // first_stochastic_config = &cfg; - // break; + first_stochastic_config = &cfg; + stochastic_picked = true; + break; } else if(!first_host_trap_config && cfg.method == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) @@ -260,7 +269,7 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, const rocprofiler_pc_sampling_configuration_t* picked_cfg = (first_stochastic_config != nullptr) ? first_stochastic_config : first_host_trap_config; - interval = picked_cfg->min_interval; + interval = (stochastic_picked) ? stochastic_interval : picked_cfg->min_interval; auto status = rocprofiler_configure_pc_sampling_service(context_id, agent_info->agent_id, @@ -272,8 +281,10 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, if(status == ROCPROFILER_STATUS_SUCCESS) { *utils::get_output_stream() - << ">>> We chose PC sampling interval: " << interval - << " on the agent: " << agent_info->agent->id.handle << std::endl; + << ">>> We chose " << (stochastic_picked ? "stochastic" : "Host-Trap") + << " PC sampling with the interval: " << interval << " " + << (stochastic_picked ? 
"clock-cycles" : "micro seconds") + << " on the agent: " << agent_info->agent->id.handle << "\n"; return; } else if(status != ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE) @@ -301,6 +312,106 @@ configure_pc_sampling_prefer_stochastic(tool_agent_info* agent_info, "Failed too many times configuring PC sampling service"); } +template +void +print_sample_common_fields(std::ostream& os, const PcSamplingRecordT* pc_sample) +{ + os << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x" << std::hex + << pc_sample->pc.code_object_offset << "), " + << "timestamp: " << std::dec << pc_sample->timestamp << ", " + << "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", " + << "workgroup_id_(x=" << std::dec << std::setw(5) << pc_sample->workgroup_id.x << ", " + << "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", " + << "z=" << std::setw(5) << pc_sample->workgroup_id.z << "), " + << "wave_in_group: " << std::setw(2) << static_cast(pc_sample->wave_in_group) + << ", " + << "chiplet: " << std::setw(2) << static_cast(pc_sample->hw_id.chiplet) << ", " + << "dispatch_id: " << std::setw(7) << pc_sample->dispatch_id << "," + << "correlation: {internal=" << std::setw(7) << pc_sample->correlation_id.internal << ", " + << "external=" << std::setw(5) << pc_sample->correlation_id.external.value << "}, "; +} + +void +print_sample(std::ostream& os, const rocprofiler_pc_sampling_record_host_trap_v0_t* sample) +{ + print_sample_common_fields(os, sample); + os << "\n"; +} + +void +print_sample(std::ostream& os, const rocprofiler_pc_sampling_record_stochastic_v0_t* sample) +{ + print_sample_common_fields(os, sample); + + if(sample->wave_issued) + { + auto* inst_c_str = rocprofiler_get_pc_sampling_instruction_type_name( + static_cast(sample->inst_type)); + utils::pcs_assert(inst_c_str != nullptr, "Invalid instruction type"); + os << "wave issued " << std::string(inst_c_str) << " instruction, "; + } + else + { + auto* reason_c_str = 
rocprofiler_get_pc_sampling_instruction_not_issued_reason_name( + static_cast( + sample->snapshot.reason_not_issued)); + utils::pcs_assert(reason_c_str != nullptr, "Invalid not issued reason"); + os << "wave is stalled due to: " << std::string(reason_c_str) << " reason, "; + } + + auto snapshot = sample->snapshot; + os << "two VALU instructions issued: " << static_cast(snapshot.dual_issue_valu) + << ", "; + + os << "arbiter state: {pipe issued: (" + << "VALU: " << static_cast(snapshot.arb_state_issue_valu) << ", " + << "MATRIX: " << static_cast(snapshot.arb_state_issue_matrix) << ", " + << "LDS: " << static_cast(snapshot.arb_state_issue_lds) << ", " + << "LDS_DIRECT: " << static_cast(snapshot.arb_state_issue_lds_direct) << ", " + << "SCALAR: " << static_cast(snapshot.arb_state_issue_scalar) << ", " + << "TEX: " << static_cast(snapshot.arb_state_issue_vmem_tex) << ", " + << "FLAT: " << static_cast(snapshot.arb_state_issue_flat) << ", " + << "EXPORT: " << static_cast(snapshot.arb_state_issue_exp) << ", " + << "MISC: " << static_cast(snapshot.arb_state_issue_misc) << "), " + << "pipe stalled: (" + << "VALU: " << static_cast(snapshot.arb_state_stall_valu) << ", " + << "MATRIX: " << static_cast(snapshot.arb_state_stall_matrix) << ", " + << "LDS: " << static_cast(snapshot.arb_state_stall_lds) << ", " + << "LDS_DIRECT: " << static_cast(snapshot.arb_state_stall_lds_direct) << ", " + << "SCALAR: " << static_cast(snapshot.arb_state_stall_scalar) << ", " + << "TEX: " << static_cast(snapshot.arb_state_stall_vmem_tex) << ", " + << "FLAT: " << static_cast(snapshot.arb_state_stall_flat) << ", " + << "EXPORT: " << static_cast(snapshot.arb_state_stall_exp) << ", " + << "MISC: " << static_cast(snapshot.arb_state_stall_misc) << ")}"; + + os << "\n"; +} + +template +static inline void +process_sample(const PcSamplingRecordT* pc_sample, + address_translation::CodeobjAddressTranslate& translator, + address_translation::FlatProfile& flat_profile) +{ + // Ignore samples from blit kernels 
or self-modifying code. + if(pc_sample->correlation_id.internal == ROCPROFILER_CORRELATION_ID_INTERNAL_NONE) return; + + auto corr_id = pc_sample->correlation_id; + // Internal correlation IDs are generated by the ROCProfiler-SDK for + // kernel dispatches only. Similarly, the test tool generate external + // correlation IDs for the kernel dispatches only. + // Thus, we should expect them to be equal. + assert(corr_id.internal == corr_id.external.value); + assert(corr_id.external.value > 0); + + // Decoding the PC + auto inst = translator.get(pc_sample->pc.code_object_id, pc_sample->pc.code_object_offset); + flat_profile.add_sample(std::move(inst), pc_sample->exec_mask); + + // TODO: introduce checks specific to stochastic sampling + // TODO: print an instruction inside print_sample +} + void rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, rocprofiler_buffer_id_t /*buffer_id*/, @@ -311,7 +422,7 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, { std::stringstream ss; ss << "The number of delivered samples is: " << num_headers << ", " - << "while the number of dropped samples is: " << drop_count << std::endl; + << "while the number of dropped samples is: " << drop_count << "\n"; auto& flat_profile = client::address_translation::get_flat_profile(); auto& translator = client::address_translation::get_address_translator(); @@ -340,48 +451,26 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, { auto* pc_sample = static_cast( cur_header->payload); - - ss << "(code_obj_id, offset): (" << pc_sample->pc.code_object_id << ", 0x" - << std::hex << pc_sample->pc.code_object_offset << "), " - << "timestamp: " << std::dec << pc_sample->timestamp << ", " - << "exec: " << std::hex << std::setw(16) << pc_sample->exec_mask << ", " - << "workgroup_id_(x=" << std::dec << std::setw(5) - << pc_sample->workgroup_id.x << ", " - << "y=" << std::setw(5) << pc_sample->workgroup_id.y << ", " - << "z=" << 
std::setw(5) << pc_sample->workgroup_id.z << "), " - << "wave_in_group: " << std::setw(2) - << static_cast(pc_sample->wave_in_group) << ", " - << "chiplet: " << std::setw(2) - << static_cast(pc_sample->hw_id.chiplet) << ", " - << "dispatch_id: " << std::setw(7) << pc_sample->dispatch_id << "," - << "correlation: {internal=" << std::setw(7) - << pc_sample->correlation_id.internal << ", " - << "external=" << std::setw(5) << pc_sample->correlation_id.external.value - << "}" << std::endl; - - // Ignore samples from blit kernels. - if(pc_sample->correlation_id.internal == - ROCPROFILER_CORRELATION_ID_INTERNAL_NONE) - continue; - - total_samples_num() += 1; - - auto corr_id = pc_sample->correlation_id; - // Internal correlation IDs are generated by the ROCProfiler-SDK for - // kernel dispatches only. Similarly, the test tool generate external - // correlation IDs for the kernel dispatches only. - // Thus, we should expect them to be equal. - assert(corr_id.internal == corr_id.external.value); - assert(corr_id.external.value > 0); - - // Decoding the PC - auto inst = translator.get(pc_sample->pc.code_object_id, - pc_sample->pc.code_object_offset); - flat_profile.add_sample(std::move(inst), pc_sample->exec_mask); + print_sample(ss, pc_sample); + process_sample(pc_sample, translator, flat_profile); + } + else if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_STOCHASTIC_V0_SAMPLE) + { + auto* pc_sample = static_cast( + cur_header->payload); + print_sample(ss, pc_sample); + process_sample(pc_sample, translator, flat_profile); + } + else if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_INVALID_SAMPLE) + { + // tracking number of invalid samples + flat_profile.add_invalid_sample(); } else { - assert(false); + std::cerr << "Unexpected kind of PC sampling record: " << cur_header->kind + << "\n"; + exit(-1); } } else @@ -391,7 +480,7 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /*context_id*/, } // TODO: do we need some sync here? 
- *utils::get_output_stream() << ss.str() << std::endl; + *utils::get_output_stream() << ss.str() << "\n"; } } } // namespace @@ -406,12 +495,7 @@ void fini() { delete pc_sampler; -} - -std::atomic& -total_samples_num() -{ - return pc_sampler->total_samples_num; + pc_sampler = nullptr; } void @@ -421,9 +505,11 @@ configure_pc_sampling_on_all_agents(rocprofiler_context_id_t context) if(pc_sampler->gpu_agents.empty()) { - *utils::get_output_stream() << "No availabe gpu agents supporting PC sampling" << std::endl; + *utils::get_output_stream() << "No availabe gpu agents supporting PC sampling" + << "\n"; // Emit the message to skip the test. - std::cerr << "PC sampling unavailable" << std::endl; + std::cerr << "PC sampling unavailable" + << "\n"; // Exit with no error if none of the GPUs support PC sampling. exit(0); } @@ -481,7 +567,8 @@ flush_and_destroy_buffers() if(status == ROCPROFILER_STATUS_ERROR_BUFFER_BUSY) { *utils::get_output_stream() - << "The buffer is busy, so we cannot destroy it at the moment." << std::endl; + << "The buffer is busy, so we cannot destroy it at the moment." + << "\n"; } else { diff --git a/projects/rocprofiler-sdk/tests/pc_sampling/pcs.hpp b/projects/rocprofiler-sdk/tests/pc_sampling/pcs.hpp index 1c8578a058..9414b94306 100644 --- a/projects/rocprofiler-sdk/tests/pc_sampling/pcs.hpp +++ b/projects/rocprofiler-sdk/tests/pc_sampling/pcs.hpp @@ -40,9 +40,6 @@ init(); void fini(); -std::atomic& -total_samples_num(); - void configure_pc_sampling_on_all_agents(rocprofiler_context_id_t context); diff --git a/projects/rocprofiler-sdk/tests/pc_sampling/utils.cpp b/projects/rocprofiler-sdk/tests/pc_sampling/utils.cpp index c0903527c5..dd866c2411 100644 --- a/projects/rocprofiler-sdk/tests/pc_sampling/utils.cpp +++ b/projects/rocprofiler-sdk/tests/pc_sampling/utils.cpp @@ -33,5 +33,19 @@ get_output_stream() static std::ostream* _v = nullptr; return _v; } + +/** + * @brief Shows @p error_msg and aborts if @p condition is false. 
+ * + */ +void +pcs_assert(bool condition, std::string_view error_msg) +{ + if(!condition) + { + std::cerr << "PC Sampling Assertion Error: " << error_msg << "\n"; + abort(); + } +} } // namespace utils } // namespace client diff --git a/projects/rocprofiler-sdk/tests/pc_sampling/utils.hpp b/projects/rocprofiler-sdk/tests/pc_sampling/utils.hpp index 2ed697958d..4f5a7aad35 100644 --- a/projects/rocprofiler-sdk/tests/pc_sampling/utils.hpp +++ b/projects/rocprofiler-sdk/tests/pc_sampling/utils.hpp @@ -61,5 +61,8 @@ namespace utils { std::ostream*& get_output_stream(); -} + +void +pcs_assert(bool condition, std::string_view error_msg); +} // namespace utils } // namespace client diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/CMakeLists.txt index 3af49e7b53..64b8e84851 100644 --- a/projects/rocprofiler-sdk/tests/pytest-packages/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/pytest-packages/CMakeLists.txt @@ -48,3 +48,5 @@ from __future__ import absolute_import from . import pytest_utils ") + +add_subdirectory(pc_sampling) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/CMakeLists.txt new file mode 100644 index 0000000000..522cfaa403 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# +# + +set(PACKAGE_OUTPUT_DIR + ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling) + +file( + WRITE "${PACKAGE_OUTPUT_DIR}/__init__.py" + "# +from __future__ import absolute_import + +from . 
import exec_mask_manipulation +") + +add_subdirectory(exec_mask_manipulation) +add_subdirectory(stochastic) +add_subdirectory(transpose_multiple_agents) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/CMakeLists.txt new file mode 100644 index 0000000000..b993b56025 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/CMakeLists.txt @@ -0,0 +1,14 @@ +# +# +# + +set(PACKAGE_OUTPUT_DIR + ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling/exec_mask_manipulation + ) + +set(PC_SAMPLING_PYTHON_SOURCES __init__.py csv.py json.py) + +foreach(_FILE ${PC_SAMPLING_PYTHON_SOURCES}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} ${PACKAGE_OUTPUT_DIR}/${_FILE} + COPYONLY) +endforeach() diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/__init__.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/__init__.py new file mode 100644 index 0000000000..d94763ee7d --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/__init__.py @@ -0,0 +1,23 @@ +# MIT License +# +# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import absolute_import diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/csv.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/csv.py new file mode 100644 index 0000000000..8b42d90664 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/csv.py @@ -0,0 +1,210 @@ +# MIT License +# +# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from __future__ import absolute_import + +import numpy as np +import pandas as pd + + +def stochastic_assert(df, df_condition_selection, max_failing_samples=10): + # TODO: When asserting certain conditions related to exec_masks for all samples, + # we observe some failures. + # This usually happens because some small number of samples (e.g., 1-10 out of 100k) + # do not satisfy the condition. This is either a regression in the ROCr 2nd level trap + # handler (as sometimes execution mask or correlation ID mismatches), or + # just stochastic nature of the sampling (meaning our checks are too strict). + # To relax checks, we introduce an assertion that will allow some small number + # of samples to disobey the condition. + # This is a temporary solution until we find the root cause of the issue. 
+ + # extract the failing samples + failing_samples = df[~df_condition_selection] + assert len(failing_samples) <= max_failing_samples, "Too many failing samples" + + +# Keep this in case we decide to revert workgroup_id information +def validate_workgoup_id_x_y_z(df, max_x, max_y, max_z): + assert (df["Workgroup_Size_X"].astype(int) >= 0).all() + assert (df["Workgroup_Size_X"].astype(int) <= max_x).all() + + assert (df["Workgroup_Size_Y"].astype(int) >= 0).all() + assert (df["Workgroup_Size_Y"].astype(int) <= max_y).all() + + assert (df["Workgroup_Size_Z"].astype(int) >= 0).all() + assert (df["Workgroup_Size_Z"].astype(int) <= max_z).all() + + +# Keep this in case we decide to revert wave_id information +def validate_wave_id(df, max_wave_id): + assert (df["Wave_Id"].astype(int) <= max_wave_id).all() + + +# Keep this in case we decide to revert chiplet information +def validate_chiplet(df, max_chiplet): + assert (df["Chiplet"].astype(int) <= max_chiplet).all() + + +def validate_instruction_decoding( + df, + inst_str, + exec_mask_uint64: np.uint64 = None, + source_code_lines_range: (int, int) = None, + all_source_lines_samples=False, +): + # Make a copy, so that we don't work (modify) a view.
+ df_inst = df[df["Instruction"].apply(lambda inst: inst.startswith(inst_str))].copy() + + assert not df_inst.empty + # assert the exec mask if requested + if exec_mask_uint64 is not None: + stochastic_assert( + df_inst, df_inst["Exec_Mask"].astype(np.uint64) == exec_mask_uint64 + ) + + # assert whether the samples' source code lines belong to the provided range + if source_code_lines_range is not None: + start_range, end_range = source_code_lines_range + # The instruction comment is usually in the following format: /path/to/source/file.cpp:line_num + df_inst["source_line_num"] = df_inst["Instruction_Comment"].apply( + lambda source_line: int(source_line.split(":")[-1]) + ) + assert (df_inst["source_line_num"] >= start_range).all() + assert (df_inst["source_line_num"] <= end_range).all() + # if requested, check if all lines from the range are sampled + if all_source_lines_samples: + assert len(df_inst["source_line_num"].unique()) == ( + end_range - start_range + 1 + ) + + +def validate_instruction_comment(df): + # Instruction comment must always be present, since the testing application + # is built with debug symbols. + assert ( + (df["Instruction_Comment"] != "") & (df["Instruction_Comment"] != "nullptr") + ).all() + + +def validate_instruction_correlation_id_relation(df): + # Samples with no decoded instructions originate from either + # blit kernels or self-modifying code. The correlation id for this + # type of samples should always be zero. + # Thus, Correlation_Id is 0 `iff` instruction is not decoded. + + # The previous statement has two implications. + # Implication 1: If the instruction is not decoded, then correlation id is 0. + samples_no_instruction_df = df[ + (df["Instruction"] == "") | (df["Instruction"] == "nullptr") + ] + assert (samples_no_instruction_df["Correlation_Id"] == 0).all() + + # Implication 2: If the correlation id is 0, then the instruction is not decoded.
+ samples_cid_zero_df = df[df["Correlation_Id"] == 0] + assert ( + (samples_cid_zero_df["Instruction"] == "") + | (samples_cid_zero_df["Instruction"] == "nullptr") + ).all() + + assert len(samples_no_instruction_df) == len(samples_cid_zero_df) + + # Since we're not enabling any kind of API tracing, + # internal correlation id should match the dispatch id + assert all(df["Correlation_Id"] == df["Dispatch_Id"]) + + +def validate_exec_mask_based_on_correlation_id(df): + # The function assumes that each kernel launches 1024 blocks. + # Each block contains number of threads that matches correlation ID of the kernel. + # The exec mask of a sample should contain number of ones equal to + # the correlation ID of the kernel during which execution the sample was generated. + df["active_SIMD_threads"] = df["Exec_Mask"].apply( + lambda exec_mask: bin(exec_mask).count("1") + ) + stochastic_assert(df, df["active_SIMD_threads"] == df["Correlation_Id"]) + + # TODO: Comment out the following code if it causes spurious fails. + # The more conservative constraint based on the experience follows. + # The exec mask of sampled instructions of the kernels respect the following pattern: + # cid -> exec + # 1 -> 0b1 + # 2 -> 0b11 + # 3 -> 0b111 + # ... 
+ # 64 -> 0xffffffffffffffff + + df["Exec_Mask2"] = ( + df["Correlation_Id"].astype(int).apply(lambda x: int("0b" + (x * "1"), 2)) + ) + + # TODO: exec should be in hex and that will ease the comparison + stochastic_assert( + df, df["Exec_Mask"].astype(np.uint64) == df["Exec_Mask2"].astype(np.uint64) + ) + + +def exec_mask_manipulation_validate_csv(df, all_sampled=False): + assert not df.empty + + validate_instruction_comment(df) + validate_instruction_correlation_id_relation(df) + + # Validate samples with non-zero correlation IDs (and with decoded instructions) + samples_cid_non_zero_df = df[df["Correlation_Id"] != 0] + + # exactly 65 kernels and 65 correlation id + assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) >= 1).all() + assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) <= 65).all() + if all_sampled: + # all correlation IDs must be sampled + assert len(samples_cid_non_zero_df["Correlation_Id"].astype(int).unique()) == 65 + + first_64_kernels_df = samples_cid_non_zero_df[ + samples_cid_non_zero_df["Correlation_Id"] <= 64 + ] + + # Make a copy, so that we don't work (modify) a view. 
+ validate_exec_mask_based_on_correlation_id(first_64_kernels_df.copy()) + + # validate the last kernel + kernel_65_df = df[df["Correlation_Id"] == 65] + + # assert that v_rcp instructions are properly decoded + # the v_rcp is executed by even SIMD threads + validate_instruction_decoding( + kernel_65_df, + "v_rcp_f64", + exec_mask_uint64=np.uint64(int("5555555555555555", 16)), + source_code_lines_range=(288, 387), + all_source_lines_samples=all_sampled, + ) + + # assert that v_rcp_f32 instructions are properly decoded + # the v_rcp_f32 is executed by odd SIMD threads + validate_instruction_decoding( + kernel_65_df, + "v_rcp_f32", + exec_mask_uint64=np.uint64(int("AAAAAAAAAAAAAAAA", 16)), + source_code_lines_range=(391, 490), + all_source_lines_samples=all_sampled, + ) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/json.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/json.py new file mode 100644 index 0000000000..5794f4a024 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/exec_mask_manipulation/json.py @@ -0,0 +1,244 @@ +# MIT License +# +# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from __future__ import absolute_import + +import numpy as np +import pandas as pd + + +def validate_json_exec_mask_manipulation( + data_json, pc_sampling_method="host_trap", all_sampled=False +): + # Although functional programming might look more elegant, + # I was trying to avoid multiple iteration over the list of samples. + # Thus, I decided to use procedural programming instead. + # Although, it would be more elegant to wrap some of the checks in dedicated functions, + # I noticed that it can introduce significant overhead, so I decided to inline those checks. 
+ + # the function assumes a homogeneous system + agents = data_json["agents"] + gpu_agents = list(filter(lambda agent: agent["type"] == 2, agents)) + # There should be at least one GPU agent + assert len(gpu_agents) > 0 + first_gpu_agent = gpu_agents[0] + num_xcc = first_gpu_agent["num_xcc"] + max_waves_per_simd = first_gpu_agent["max_waves_per_simd"] + simd_per_cu = first_gpu_agent["simd_per_cu"] + + instructions = data_json["strings"]["pc_sample_instructions"] + comments = data_json["strings"]["pc_sample_comments"] + + # execution mask where even SIMD lanes are active + # corresponds to the v_rcp_f64 instructions of the last kernel + even_simds_active_exec_mask = np.uint64(int("5555555555555555", 16)) + # start and end source code lines of the v_rcp_f64 instructions of the last kernel + v_rcp_f64_start_line_num, v_rcp_f64_end_line_num = 288, 387 + # execution mask where odd SIMD lanes are active + # corresponds to the v_rcp_f32 instructions of the last kernel + odd_simds_active_exec_mask = np.uint64(int("AAAAAAAAAAAAAAAA", 16)) + # start and end source code lines of the v_rcp_f32 instructions of the last kernel + v_rcp_f32_start_line_num, v_rcp_f32_end_line_num = 391, 490 + + # sampled wave_ids of the last kernel + kernel65_sampled_wave_in_grp = set() + # sampled source lines of the last kernel matching v_rcp_f64 instructions + kernel65_v_rcp_64_sampled_source_line_set = set() + # sampled source lines of the last kernel matching v_rcp_f32 instructions + kernel65_v_rcp_f32_sampled_source_line_set = set() + # sampled correlation IDs + sampled_cids_set = set() + # pairs of sampled SIMD ids and waveslot IDs + sampled_simd_waveslots_pairs = set() + # sampled chiplets + sampled_chiplets = set() + # sample VMIDs + sampled_vmids = set() + # TODO: Similar reason for introducing stochastic_assert inside the csv.py. + # When asserting certain conditions related to exec_masks for all samples, + # we observe some failures.
+ # This usually happens because some small number of samples (e.g., 1-10 out of 100k) + # do not satisfy the condition. This is either a regression in the ROCr 2nd level trap + # handler (as sometimes execution mask or correlation ID mismatches), or + # just stochastic nature of the sampling (meaning our checks are too strict). + # To relax checks, we introduce an assertion that will allow some small number + # of samples to disobey the condition. + # This is a temporary solution until we find the root cause of the issue. + + failing_exec_mask_checks_samples_num = 0 + # We noticed failing samples in: + # 1. kernels 1-64 + # 2. kernel 65 even SIMD lanes + # 3. kernel 65 odd SIMD lanes + # The number of failing samples is less than 10 per category. + max_number_of_failing_records = 30 + + for sample in data_json["buffer_records"][f"pc_sample_{pc_sampling_method}"]: + record = sample["record"] + cid = record["corr_id"]["internal"] + + # pull information from hw_id + hw_id = record["hw_id"] + sampled_chiplets.add(hw_id["chiplet"]) + sampled_simd_waveslots_pairs.add((hw_id["simd_id"], hw_id["wave_id"])) + sampled_vmids.add(hw_id["vm_id"]) + + # Checks specific for all samples + + # cids must be non-negative numbers + assert cid >= 0 + + inst_index = sample["inst_index"] + + # Since we're not enabling any kind of API tracing, the internal correlation id should + # be equal to the dispatch_id + assert cid == record["dispatch_id"] + + if cid == 0: + # Samples originate either from a blit kernel or self-modifying code. + # Thus, code object is unknown, as well as the instruction.
+ assert record["pc"]["code_object_id"] == 0 + assert inst_index == -1 + else: + # Update set of sampled cids + sampled_cids_set.add(cid) + + # All samples with non-zero correlation ID should pass the following checks + # code object is known, and so is the instruction + assert record["pc"]["code_object_id"] != 0 + assert inst_index != -1 + + wgid = record["wrkgrp_id"] + # check coordinates of the workgroup + assert wgid["x"] >= 0 and wgid["x"] <= 1023 + assert wgid["y"] == 0 + assert wgid["z"] == 0 + + wave_in_grp = record["wave_in_grp"] + exec_mask = record["exec_mask"] + + if cid < 65: + # checks specific for samples from first 64 kernels + assert wave_in_grp == 0 + # inline if possible + # validate_json_exec_mask_based_on_cid(sample.record) + + # The function assumes that each kernel launches 1024 blocks. + # Each block contains number of threads that matches correlation ID of the kernel. + # The exec mask of a sample should contain number of ones equal to + # the correlation ID of the kernel during which execution the sample was generated. + # assert bin(exec_mask).count("1") == cid + if bin(exec_mask).count("1") != cid: + failing_exec_mask_checks_samples_num += 1 + + # TODO: Comment out the following code if it causes spurious fails. + # The more conservative constraint based on the experience follows. + # The exec mask of sampled instructions of the kernels respect the following pattern: + # cid -> exec + # 1 -> 0b1 + # 2 -> 0b11 + # 3 -> 0b111 + # ...
+ # 64 -> 0xffffffffffffffff + exec_mask_str = "0b" + "1" * cid + # assert np.uint64(exec_mask) == np.uint64(int(exec_mask_str, 2)) + if np.uint64(exec_mask) != np.uint64(int(exec_mask_str, 2)): + failing_exec_mask_checks_samples_num += 1 + else: + # No more than 65 cids + assert cid == 65 + # Monitor wave_in_group being sampled + kernel65_sampled_wave_in_grp.add(wave_in_grp) + # checks specific for samples from the last kernel + assert wave_in_grp >= 0 and wave_in_grp <= 3 + + # validate instruction decoding + inst = instructions[inst_index] + comm = comments[inst_index] + # The instruction comment is usually in the following format: + # /path/to/source/file.cpp:line_num + line_num = int(comm.split(":")[-1]) + if inst.startswith("v_rcp_f64"): + # even SIMD lanes active + # assert np.uint64(exec_mask) == even_simds_active_exec_mask + if np.uint64(exec_mask) != even_simds_active_exec_mask: + failing_exec_mask_checks_samples_num += 1 + + assert ( + line_num >= v_rcp_f64_start_line_num + and line_num <= v_rcp_f64_end_line_num + ) + kernel65_v_rcp_64_sampled_source_line_set.add(line_num) + elif inst.startswith("v_rcp_f32"): + # odd SIMD lanes active + # assert np.uint64(exec_mask) == odd_simds_active_exec_mask + if np.uint64(exec_mask) != odd_simds_active_exec_mask: + failing_exec_mask_checks_samples_num += 1 + + assert ( + line_num >= v_rcp_f32_start_line_num + and line_num <= v_rcp_f32_end_line_num + ) + kernel65_v_rcp_f32_sampled_source_line_set.add(line_num) + + if all_sampled: + # All cids that belong to the range [1, 65] should be sampled + assert len(sampled_cids_set) == 65 + + # all wave_ids that belong to the range [0, 3] should be sampled for the last kernel + assert len(kernel65_sampled_wave_in_grp) == 4 + + # all source lines matching v_rcp_f64 instructions of the last kernel should be sampled + assert len(kernel65_v_rcp_64_sampled_source_line_set) == ( + v_rcp_f64_end_line_num - v_rcp_f64_start_line_num + 1 + ) + # all source lines matching v_rcp_f32
instructions of the last kernel should be sampled + assert len(kernel65_v_rcp_f32_sampled_source_line_set) == ( + v_rcp_f32_end_line_num - v_rcp_f32_start_line_num + 1 + ) + + # all chiplets must be sampled + assert len(sampled_chiplets) == num_xcc + # all (simd ID, waveslot ID) pairs must be samples + assert len(sampled_simd_waveslots_pairs) == simd_per_cu * max_waves_per_simd + + # assert chiplet index + assert all(map(lambda chiplet: 0 <= chiplet < num_xcc, sampled_chiplets)) + # assert (SIMD ID, waveslot ID) combinations + assert all( + map( + lambda simd_waveslot: (0 <= simd_waveslot[0] < simd_per_cu) + and (0 <= simd_waveslot[1] < max_waves_per_simd), + sampled_simd_waveslots_pairs, + ) + ) + + # Apparently, not all dispatches must belong to the same VMID, + # so I'm temporarily disabling the following check. + # # all samples should belong to the same VMID + # assert len(sampled_vmids) == 1 + + # assert that the number of failing samples is acceptable + assert ( + failing_exec_mask_checks_samples_num <= max_number_of_failing_records + ), "Number of failing samples failing exec_mask check is too high" diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/CMakeLists.txt new file mode 100644 index 0000000000..67f0c13e57 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# +# + +set(PACKAGE_OUTPUT_DIR + ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling/stochastic + ) + +set(PC_SAMPLING_PYTHON_SOURCES __init__.py) + +foreach(_FILE ${PC_SAMPLING_PYTHON_SOURCES}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} ${PACKAGE_OUTPUT_DIR}/${_FILE} + COPYONLY) +endforeach() + +add_subdirectory(csv) +add_subdirectory(json) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/__init__.py 
b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/__init__.py new file mode 100644 index 0000000000..f28a7b13c3 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/__init__.py @@ -0,0 +1,24 @@ +# MIT License +# +# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + +from __future__ import absolute_import diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/CMakeLists.txt new file mode 100644 index 0000000000..74ba8f4c32 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# +# + +set(PACKAGE_OUTPUT_DIR + ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling/stochastic/csv + ) + +set(PC_SAMPLING_PYTHON_SOURCES __init__.py) + +foreach(_FILE ${PC_SAMPLING_PYTHON_SOURCES}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} ${PACKAGE_OUTPUT_DIR}/${_FILE} + COPYONLY) +endforeach() + +add_subdirectory(gfx9) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/__init__.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/__init__.py new file mode 100644 index 0000000000..8e876b72cd --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/__init__.py @@ -0,0 +1,24 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from __future__ import absolute_import diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/CMakeLists.txt new file mode 100644 index 0000000000..2d2edf9141 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# +# + +set(PACKAGE_OUTPUT_DIR + ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling/stochastic/csv/gfx9 + ) + +set(PC_SAMPLING_PYTHON_SOURCES + __init__.py valu_instructions.py matrix_instructions.py texture_instructions.py + flat_instructions.py lds_instructions.py) + +foreach(_FILE ${PC_SAMPLING_PYTHON_SOURCES}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} ${PACKAGE_OUTPUT_DIR}/${_FILE} + COPYONLY) +endforeach() + +add_subdirectory(s_instructions) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/__init__.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/__init__.py new file mode 100644 index 0000000000..2daac780cd --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/__init__.py @@ -0,0 +1,110 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + +from __future__ import absolute_import + +import numpy as np +import pandas as pd + +from .s_instructions import validate_s_instructions +from .valu_instructions import validate_valu_instructions +from .texture_instructions import validate_texture_instructions +from .matrix_instructions import validate_matrix_instructions +from .lds_instructions import validate_lds_instructions +from .flat_instructions import validate_flat_instructions + + +def validate_wave_count(df): + # Validating number of active waves on a CU + assert ( + (df["Wave_Count"] >= 1) & (df["Wave_Count"] <= 32) + ).all(), "Invalid Wave_Count" + + +def validate_issued_instruction_type_no_inst(samples): + # NO_INST type of instructions means instruction is not issued + issued_type_no_inst = samples[samples["Instruction_Type"] == "NO_INST"] + assert len(issued_type_no_inst) == 0, "NO_INST implies no instruction is issued" + + +def validate_issued_instruction_type_other(samples): + # OTHER type of instructions still to be determined + issued_type_other = samples[samples["Instruction_Type"] == "OTHER"] + assert len(issued_type_other) == 0, "OTHER type of instruction observed first time" + + +def validate_issued_instruction_type_lds_direct(samples): + # LDS_DIRECT type of instructions do not exist on gfx9 + issued_type_lds_direct = samples[samples["Instruction_Type"] == "LDS_DIRECT"] + assert ( + len(issued_type_lds_direct) == 0 + ), "LDS direct type of instruction observed on GFX9" + + +def validate_issued_instruction_type_dual_valu(samples): + # DUAL_VALU type of instructions do not exist on gfx9 + issued_type_dual_valu = samples[samples["Instruction_Type"] == "DUAL_VALU"] + assert ( + len(issued_type_dual_valu) == 0 + ), "DUAL_VALU type of instruction observed on GFX9" + + +# TODO: add checks for missing instruction types +# - export + + +def validate_stochastic_samples_csv(df: pd.DataFrame): + # We expect more valid than invalid samples + # TODO: use stats for comparing valid vs invalid samples
+ # invalid_samples = df[df["Valid"] == False] + # valid_samples = df[df["Valid"]].copy() + # assert len(valid_samples) > len(invalid_samples) + + # only valid samples reside in df + valid_samples = df.copy() + + validate_wave_count(valid_samples) + + # The following checks assumes that we were able to decode + # the instruction, meaning a code object and dispatch must be known. + valid_samples = valid_samples[valid_samples["Dispatch_Id"] > 0] + + # scalar, barrier, waitcnt, jump, message, branches (taken and not taken) + # are handled inside `validate_s_instructions` function + validate_s_instructions(valid_samples) + validate_valu_instructions(valid_samples) + validate_texture_instructions(valid_samples) + validate_matrix_instructions(valid_samples) + validate_lds_instructions(valid_samples) + validate_flat_instructions(valid_samples) + + # validating issued instructions for uncovered types + valid_samples_issued = valid_samples[ + valid_samples["Wave_Issued_Instruction"] == True + ].copy() + validate_issued_instruction_type_no_inst(valid_samples_issued) + validate_issued_instruction_type_other(valid_samples_issued) + + # The following two types of instructions should not be observed on gfx9 + validate_issued_instruction_type_lds_direct(valid_samples_issued) + validate_issued_instruction_type_dual_valu(valid_samples_issued) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/flat_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/flat_instructions.py new file mode 100644 index 0000000000..6fa9d3a3a3 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/flat_instructions.py @@ -0,0 +1,74 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import absolute_import

_TYPE_FLAT = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_FLAT"

# Stall reasons accepted for flat_/global_ instructions that were not issued.
_FLAT_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
)


def _is_flat_or_global(samples):
    """Return a boolean mask of rows whose instruction starts with flat_/global_.

    The vectorised string match (instead of ``apply`` with a lambda) keeps the
    mask boolean even for an empty frame, and treats a missing instruction as
    "no match" instead of raising.
    """
    mask = samples["Instruction"].str.match(r"(?:global|flat)_", na=False)
    return mask.astype(bool)


def validate_flat_instructions_issued(samples_issued):
    """Check FLAT type <-> flat_/global_ mnemonic in both directions.

    *samples_issued* must contain only issued samples.
    """
    # issued instruction with type == FLAT -> instruction starts with either flat_ or global_
    issued_type_flat = samples_issued[samples_issued["Instruction_Type"] == _TYPE_FLAT]
    assert _is_flat_or_global(
        issued_type_flat
    ).all(), "issued FLAT-typed sample is neither a flat_ nor a global_ instruction"

    # if issued instruction starts with global_ or flat_ -> its type must be FLAT
    issued_flat_or_global = samples_issued[_is_flat_or_global(samples_issued)]
    assert (
        issued_flat_or_global["Instruction_Type"] == _TYPE_FLAT
    ).all(), "issued flat_/global_ instruction is not typed as FLAT"


def validate_flat_instructions_stalled(samples):
    """Check that stalled flat_/global_ samples report an accepted stall reason."""
    flat_samples = samples[_is_flat_or_global(samples)]
    flat_stalled = flat_samples[flat_samples["Wave_Issued_Instruction"].eq(False)]

    assert (
        flat_stalled["Stall_Reason"].isin(_FLAT_STALL_REASONS).all()
    ), "unexpected stall reason for a flat_/global_ instruction"


def validate_flat_instructions(samples):
    """Validate issued and stalled flat_/global_ samples of *samples*."""
    samples_issued = samples[samples["Wave_Issued_Instruction"]]
    validate_flat_instructions_issued(samples_issued)
    validate_flat_instructions_stalled(samples)


# --- patch metadata of the next file, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/lds_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/lds_instructions.py
# new file mode 100644
# index 0000000000..670cd1a58a
# --- /dev/null
# +++
b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/lds_instructions.py @@ -0,0 +1,67 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import absolute_import

_TYPE_LDS = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS"

# Stall reasons accepted for ds_ instructions that were not issued.
# TODO: question - why we observed alu_dependency on matrix_multiply_tile kernel
_LDS_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
)


def _is_ds(samples):
    """Return a boolean mask of rows whose instruction starts with ``ds_``.

    The vectorised string test keeps the mask boolean even for an empty frame
    and treats a missing instruction as "no match" instead of raising.
    """
    return samples["Instruction"].str.startswith("ds_", na=False).astype(bool)


def validate_lds_instructions_issued(samples_issued):
    """Check LDS type <-> ds_ mnemonic in both directions.

    *samples_issued* must contain only issued samples.
    """
    # issued instruction with type == LDS -> instruction starts with ds_
    issued_type_lds = samples_issued[samples_issued["Instruction_Type"] == _TYPE_LDS]
    assert _is_ds(
        issued_type_lds
    ).all(), "issued LDS-typed sample is not a ds_ instruction"

    # issued instruction starts with ds_ -> it must be LDS
    issued_ds = samples_issued[_is_ds(samples_issued)]
    assert (
        issued_ds["Instruction_Type"] == _TYPE_LDS
    ).all(), "issued ds_ instruction is not typed as LDS"


def validate_lds_instructions_stalled(samples):
    """Check that stalled ds_ samples report an accepted stall reason."""
    lds_samples = samples[_is_ds(samples)]
    lds_stalled = lds_samples[lds_samples["Wave_Issued_Instruction"].eq(False)]

    assert (
        lds_stalled["Stall_Reason"].isin(_LDS_STALL_REASONS).all()
    ), "unexpected stall reason for a ds_ instruction"


def validate_lds_instructions(samples):
    """Validate issued and stalled ds_ samples of *samples*."""
    samples_issued = samples[samples["Wave_Issued_Instruction"]]
    validate_lds_instructions_issued(samples_issued)
    validate_lds_instructions_stalled(samples)


# --- patch metadata of the next file, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/matrix_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/matrix_instructions.py
# new file mode 100644
# index 0000000000..d82166cde0
# --- /dev/null
# +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/matrix_instructions.py
# @@ -0,0 +1,107 @@
# MIT
License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import absolute_import

_TYPE_MATRIX = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MATRIX"

# Stall reasons accepted for v_mfma instructions that were not issued.
_MATRIX_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
)


def _starts_with(samples, prefix):
    """Return a boolean mask of rows whose instruction starts with *prefix*.

    The vectorised string test keeps the mask boolean even for an empty frame
    and treats a missing instruction as "no match" instead of raising.
    """
    return samples["Instruction"].str.startswith(prefix, na=False).astype(bool)


def _validate_stalled(samples, prefix):
    """Assert that stalled samples starting with *prefix* have an accepted stall reason."""
    matching = samples[_starts_with(samples, prefix)]
    stalled = matching[matching["Wave_Issued_Instruction"].eq(False)]
    assert (
        stalled["Stall_Reason"].isin(_MATRIX_STALL_REASONS).all()
    ), "unexpected stall reason for a " + prefix + " instruction"


def validate_matrix_instructions_issued(samples_issued):
    """Check MATRIX type <-> v_mfma mnemonic for issued samples.

    *samples_issued* must contain only issued samples.
    """
    # issued instruction with type == MATRIX -> instruction starts with v_mfma
    issued_type_matrix = samples_issued[
        samples_issued["Instruction_Type"] == _TYPE_MATRIX
    ]
    assert _starts_with(
        issued_type_matrix, "v_mfma"
    ).all(), "issued MATRIX-typed sample is not a v_mfma instruction"
    # v_mfma_f32 goes through Matrix (MAI) arbiter, while v_mfma_f64 goes through the VALU arbiter

    # SGEMM goes through Matrix (MAI arbiter)
    v_mfma_f32_issued = samples_issued[_starts_with(samples_issued, "v_mfma_f32")]
    assert (
        v_mfma_f32_issued["Instruction_Type"] == _TYPE_MATRIX
    ).all(), "issued v_mfma_f32 instruction is not typed as MATRIX"

    # DGEMM goes through VALU arbiter, but the count check below only holds if
    # its instruction type is MATRIX as well.  The comparison uses the fully
    # qualified name: the bare "MATRIX" never equals the qualified value, so it
    # failed on any issued v_mfma_f64 sample.
    # NOTE(review): confirm MATRIX (and not VALU) is the intended type for f64.
    v_mfma_f64_issued = samples_issued[_starts_with(samples_issued, "v_mfma_f64")]
    assert (
        v_mfma_f64_issued["Instruction_Type"] == _TYPE_MATRIX
    ).all(), "issued v_mfma_f64 instruction is not typed as MATRIX"
    assert len(issued_type_matrix) == len(v_mfma_f32_issued) + len(
        v_mfma_f64_issued
    ), "MATRIX-typed samples other than v_mfma_f32/v_mfma_f64 observed"

    # TODO: find an example with MAI instructions


def validate_dgemm_matrix_instructions_stalled(samples):
    """Check the stall reasons of stalled v_mfma_f64 samples."""
    _validate_stalled(samples, "v_mfma_f64")


def validate_sgemm_matrix_instructions_stalled(samples):
    """Check the stall reasons of stalled v_mfma_f32 samples."""
    _validate_stalled(samples, "v_mfma_f32")


def validate_matrix_instructions_stalled(samples):
    """Check the stall reasons of stalled DGEMM and SGEMM matrix samples."""
    validate_dgemm_matrix_instructions_stalled(samples)
    validate_sgemm_matrix_instructions_stalled(samples)
    # TODO: find an example to test this


def validate_matrix_instructions(samples):
    """Validate issued and stalled v_mfma samples of *samples*."""
    samples_issued = samples[samples["Wave_Issued_Instruction"]]
    validate_matrix_instructions_issued(samples_issued)
    validate_matrix_instructions_stalled(samples)


# --- patch content of the next files, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/CMakeLists.txt
# new file mode 100644
# index 0000000000..7542162120
# --- /dev/null
# +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/CMakeLists.txt
# @@ -0,0 +1,23 @@
# +#
# +#
# +#
# +
# +set(PACKAGE_OUTPUT_DIR
# +    ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling/stochastic/csv/gfx9/s_instructions
# +    )
# +
# +set(PC_SAMPLING_PYTHON_SOURCES
# +    __init__.py
# +    branch_instructions.py
# +    waitcnt.py
# +    other_instructions.py
# +    scalar_instructions.py
# +    internal_instructions.py
# +    jump_instructions.py
# +    message_instructions.py
# +    barrier_instructions.py)
# +
# +foreach(_FILE ${PC_SAMPLING_PYTHON_SOURCES})
# +    configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} ${PACKAGE_OUTPUT_DIR}/${_FILE}
# +                   COPYONLY)
# +endforeach()
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/__init__.py
b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/__init__.py new file mode 100644 index 0000000000..335b797807 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/__init__.py @@ -0,0 +1,151 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import absolute_import

from functools import partial

from .branch_instructions import validate_branch_instructions
from .waitcnt import validate_waitcnt
from .other_instructions import validate_other_instructions
from .scalar_instructions import validate_scalar_instructions
from .internal_instructions import validate_internal_instructions
from .jump_instructions import validate_jump_instructions
from .message_instructions import validate_message_instructions
from .barrier_instructions import validate_barrier_instructions


# Using Prefix Tree to classify the instruction type
# I did this instead of the regex because I wanted to try if we could
# generalize this approach for other types of instructions.
# The dream scenario: We have a giant list of all instructions and their
# types. Then we parse the list and dynamically determine the checks
# based on the instruction types.


# TODO: extract this outside of the file
class TrieNode:
    """One node of the prefix tree: one character of an inserted prefix."""

    def __init__(self):
        # maps the next character to the child node
        self.children = {}
        # set on the node where an inserted prefix ends, otherwise None
        self.instruction_type = None


class PrefixTree:
    """Character trie mapping instruction-name prefixes to instruction types."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, full_prefix, instruction_type):
        """Insert a prefix and its associated instruction type into the Trie.

        Inserting the same prefix again overwrites the stored type.
        """
        node = self.root
        for char in full_prefix:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        # Assign the instruction type at the node where the prefix ends
        node.instruction_type = instruction_type

    def get_instruction_type(self, instruction):
        """Return the types of all inserted prefixes that *instruction* starts with.

        The list is ordered from the shortest matching prefix to the longest,
        so the last element belongs to the most specific prefix.  It is empty
        when no inserted prefix matches.
        """
        node = self.root
        matched_types = []  # List to store matched types

        # Traverse the instruction one character at a time
        for char in instruction:
            if char not in node.children:
                break  # Stop if no match is found

            node = node.children[char]

            # If we reach a node that has an instruction type, store it
            if node.instruction_type:
                matched_types.append(node.instruction_type)

        return matched_types


# Each prefix appears exactly once.  The table used to list "s_setpc" twice and
# "s_sleep" as both JUMP and INTERNAL; since a later insert overwrites an
# earlier one, "s_sleep" was always classified as INTERNAL, which is kept here.
instructions_with_types = [
    ("s_", "SCALAR"),  # Scalar instructions (general category)
    ("s_waitcnt", "WAITCNT"),  # WAITCNT (specific)
    ("s_sendmsg", "MESSAGE"),  # MESSAGE (specific)
    ("s_barrier", "BARRIER"),  # BARRIER (specific)
    ("s_swappc", "JUMP"),  # JUMP (specific)
    ("s_setpc", "JUMP"),  # JUMP
    ("s_branch", "BRANCH"),  # BRANCH
    ("s_cbranch", "BRANCH"),  # BRANCH (conditional)
    ("s_wakeup", "OTHER"),  # OTHER
    ("s_nop", "INTERNAL"),  # INTERNAL
    ("s_sleep", "INTERNAL"),  # INTERNAL
]


# Dispatch table: category name -> validator taking (all_samples, category_samples)
inst_type_verify_functions = {
    "BRANCH": validate_branch_instructions,
    "WAITCNT": validate_waitcnt,
    "OTHER": validate_other_instructions,
    "SCALAR": validate_scalar_instructions,
    "INTERNAL": validate_internal_instructions,
    "JUMP": validate_jump_instructions,
    "MESSAGE": validate_message_instructions,
    "BARRIER": validate_barrier_instructions,
}


# Function to classify instructions based on the Trie
def classify_instruction_by_prefix(prefix_tree, instruction):
    """Return the most specific category of *instruction* according to *prefix_tree*.

    Raises ``IndexError`` when *instruction* is empty/blank or when no prefix
    of the tree matches its mnemonic.
    """
    # extracting the base of the instruction (e.g., s_mov_*, v_mov_*, s_setpc_*, ...)
    base_instruction = instruction.split()[0]

    # Classify based on the Trie (general classification)
    instruction_types = prefix_tree.get_instruction_type(base_instruction)

    # always use the specific type
    return instruction_types[-1]


def enforce_type_inheritance(sub_df, parent_df):
    """Cast every column of *sub_df* to the dtype of the same column in *parent_df*.

    *sub_df* is modified in place and also returned.
    """
    for col in parent_df.columns:
        sub_df[col] = sub_df[col].astype(parent_df[col].dtype)
    return sub_df


def validate_s_instructions(df):
    """Classify all ``s_`` instructions of *df* by name and validate each category.

    Every category validator receives the complete frame *df* plus the rows
    belonging to its category.
    """
    # The vectorised string test keeps the mask boolean even when df is empty
    # and treats a missing instruction as "not an s_ instruction".
    is_scalar = df["Instruction"].str.startswith("s_", na=False).astype(bool)
    s_instructions = df[is_scalar].copy()

    # fill in the Prefix Tree
    prefix_tree = PrefixTree()
    for prefix, instruction_type in instructions_with_types:
        prefix_tree.insert(prefix, instruction_type)

    _classify_instruction_by_prefix = partial(classify_instruction_by_prefix, prefix_tree)
    s_instructions["Instruction_Type_From_Name"] = s_instructions["Instruction"].apply(
        _classify_instruction_by_prefix
    )

    for inst_type, subframe in s_instructions.groupby("Instruction_Type_From_Name"):
        # subframe = enforce_type_inheritance(subframe, s_instructions)
        if inst_type in inst_type_verify_functions:
            # Pass all samples and filtered samples to the verification function.
            inst_type_verify_functions[inst_type](df, subframe)


# --- patch metadata of the next file, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/barrier_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/barrier_instructions.py
# new file mode 100644
# index 0000000000..8a4dd0f08e
# --- /dev/null
# +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/barrier_instructions.py
# @@ -0,0 +1,65 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import absolute_import

_TYPE_BARRIER = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BARRIER"

# Stall reasons accepted for barrier instructions that were not issued.
_BARRIER_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_BARRIER_WAIT",
)


def validate_barrier_instructions_issued(all_samples, barrier_samples):
    """Check that issued barrier samples and BARRIER-typed samples coincide.

    *all_samples* is the complete frame; *barrier_samples* holds the rows that
    were classified as barriers by instruction name.
    """
    barrier_type_samples_issued = all_samples[
        all_samples["Wave_Issued_Instruction"]
        & (all_samples["Instruction_Type"] == _TYPE_BARRIER)
    ]

    barrier_samples_issued = barrier_samples[barrier_samples["Wave_Issued_Instruction"]]
    # sanity check: as many issued BARRIER-typed samples as issued barrier mnemonics
    assert len(barrier_type_samples_issued) == len(
        barrier_samples_issued
    ), "issued BARRIER-typed samples do not match issued barrier instructions"
    # every issued barrier mnemonic must carry the BARRIER type
    assert (
        barrier_samples_issued["Instruction_Type"] == _TYPE_BARRIER
    ).all(), "issued barrier instruction is not typed as BARRIER"


def validate_barrier_instructions_stalled(barrier_samples):
    """Check that stalled barrier samples report an accepted stall reason."""
    barrier_samples_stalled = barrier_samples[
        barrier_samples["Wave_Issued_Instruction"].eq(False)
    ]
    assert (
        barrier_samples_stalled["Stall_Reason"].isin(_BARRIER_STALL_REASONS).all()
    ), "unexpected stall reason for a barrier instruction"


def validate_barrier_instructions(all_samples, barrier_samples):
    """Validate issued and stalled barrier samples."""
    validate_barrier_instructions_issued(all_samples, barrier_samples)
    validate_barrier_instructions_stalled(barrier_samples)


# --- patch metadata of the next file, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/branch_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/branch_instructions.py
# new file mode 100644
# index 0000000000..a29426a5b3
# --- /dev/null
# +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/branch_instructions.py
# @@ -0,0 +1,142 @@
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import absolute_import

_TYPE_BRANCH_TAKEN = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN"
_TYPE_BRANCH_NOT_TAKEN = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN"

# Stall reasons accepted for any branch that was not issued.
_BRANCH_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
)

# Stall reasons not observed so far for unconditional branches (s_branch).
_S_BRANCH_UNEXPECTED_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
)


def _instruction_matches(samples, pattern):
    """Return a boolean mask of rows whose instruction starts with regex *pattern*.

    The vectorised match keeps the mask boolean even for an empty frame and
    treats a missing instruction as "no match" instead of raising.
    """
    return samples["Instruction"].str.match(pattern, na=False).astype(bool)


def validate_issued_instruction_type_branch_taken(samples):
    """Check the BRANCH_TAKEN type against the branch mnemonics of issued samples."""
    issued = samples["Wave_Issued_Instruction"]

    # issued instruction with type BRANCH_TAKEN -> instruction starts with either s_cbranch or s_branch
    issued_type_branch_taken = samples[
        (samples["Instruction_Type"] == _TYPE_BRANCH_TAKEN) & issued
    ]
    assert _instruction_matches(
        issued_type_branch_taken, r"s_c?branch"
    ).all(), "issued BRANCH_TAKEN sample is neither s_branch nor s_cbranch"

    # if issued instruction starts with s_branch (unconditional branch) -> its type must be BRANCH_TAKEN
    issued_s_branch = samples[_instruction_matches(samples, r"s_branch") & issued]
    assert (
        issued_s_branch["Instruction_Type"] == _TYPE_BRANCH_TAKEN
    ).all(), "issued s_branch is not typed as BRANCH_TAKEN"

    # see `validate_issued_instruction_type_branch_not_taken` for more info about s_cbranch checks


def validate_issued_instruction_type_branch_not_taken(samples):
    """Check the BRANCH_NOT_TAKEN type against conditional branches of issued samples."""
    issued = samples["Wave_Issued_Instruction"]

    # issued instruction with type BRANCH_NOT_TAKEN -> instruction is conditional branch (starts s_cbranch)
    issued_type_branch_not_taken = samples[
        (samples["Instruction_Type"] == _TYPE_BRANCH_NOT_TAKEN) & issued
    ]
    assert _instruction_matches(
        issued_type_branch_not_taken, r"s_cbranch"
    ).all(), "issued BRANCH_NOT_TAKEN sample is not an s_cbranch instruction"

    # if issued instruction starts with s_cbranch -> its type is either BRANCH_TAKEN or BRANCH_NOT_TAKEN
    issued_s_cbranch = samples[_instruction_matches(samples, r"s_cbranch") & issued]
    assert (
        issued_s_cbranch["Instruction_Type"]
        .isin((_TYPE_BRANCH_TAKEN, _TYPE_BRANCH_NOT_TAKEN))
        .all()
    ), "issued s_cbranch is typed neither BRANCH_TAKEN nor BRANCH_NOT_TAKEN"


def s_branch_not_issued(stalled_samples):
    """Apply the extra stall-reason constraints for unconditional branches.

    *stalled_samples* must contain only samples that were not issued.
    """
    s_branch_stalled = stalled_samples[_instruction_matches(stalled_samples, r"s_branch")]

    # No ALUDEP nor ARBWINEXSTALL observed so far for unconditional branches
    assert not (
        s_branch_stalled["Stall_Reason"].isin(_S_BRANCH_UNEXPECTED_STALL_REASONS).any()
    ), "unexpected stall reason for an unconditional branch"


def validate_stalled_branches(samples):
    """Check that stalled branch samples report an accepted stall reason."""
    stalled_samples = samples[samples["Wave_Issued_Instruction"].eq(False)]

    assert (
        stalled_samples["Stall_Reason"].isin(_BRANCH_STALL_REASONS).all()
    ), "unexpected stall reason for a branch instruction"

    # Further constraints for unconditional branches
    s_branch_not_issued(stalled_samples)


def validate_branch_instructions(all_samples, branch_samples):
    """
    Use all_samples to verify the ROCProfV3 determines `Instruction_Type` field properly.

    Use branch_samples to verify the stalled branch instructions.
    """
    # For the issued branches, use all samples, as the called functions will do
    # separation based on branch type (conditional or unconditional)
    validate_issued_instruction_type_branch_taken(all_samples)
    validate_issued_instruction_type_branch_not_taken(all_samples)

    # stalled branches
    validate_stalled_branches(branch_samples)


# --- patch metadata of the next file, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/internal_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/internal_instructions.py
# new file mode 100644
# index 0000000000..b91875800a
# --- /dev/null
# +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/internal_instructions.py
# @@ -0,0 +1,38 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from __future__ import absolute_import

# Stall reasons accepted for internal instructions.
_INTERNAL_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL_INSTRUCTION",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
)


def validate_internal_instructions(all_samples, internal_samples):
    """Check that internal instructions are never issued and stall for an accepted reason.

    *all_samples* is not used; the parameter is kept so that all category
    validators share the ``(all_samples, category_samples)`` signature.
    """
    assert internal_samples["Wave_Issued_Instruction"].eq(
        False
    ).all(), "internal instruction reported as issued"
    assert (
        internal_samples["Stall_Reason"].isin(_INTERNAL_STALL_REASONS).all()
    ), "unexpected stall reason for an internal instruction"


# --- patch metadata of the next file, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/jump_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/jump_instructions.py
# new file mode 100644
# index 0000000000..ff0da196bf
# --- /dev/null
# +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/jump_instructions.py
# @@ -0,0 +1,60 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from __future__ import absolute_import

_TYPE_JUMP = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_JUMP"

# Stall reasons accepted for jump instructions that were not issued.
_JUMP_STALL_REASONS = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
)


def validate_jump_instructions_issued(all_samples, jump_samples):
    """Check that issued jump samples and JUMP-typed samples coincide.

    *all_samples* is the complete frame; *jump_samples* holds the rows that
    were classified as jumps by instruction name.
    """
    jump_type_samples_issued = all_samples[
        all_samples["Wave_Issued_Instruction"]
        & (all_samples["Instruction_Type"] == _TYPE_JUMP)
    ]

    jump_samples_issued = jump_samples[jump_samples["Wave_Issued_Instruction"]]
    # sanity check: as many issued JUMP-typed samples as issued jump mnemonics
    assert len(jump_type_samples_issued) == len(
        jump_samples_issued
    ), "issued JUMP-typed samples do not match issued jump instructions"
    # every issued jump mnemonic must carry the JUMP type
    assert (
        jump_samples_issued["Instruction_Type"] == _TYPE_JUMP
    ).all(), "issued jump instruction is not typed as JUMP"


def validate_jump_instructions_stalled(jump_samples):
    """Check that stalled jump samples report an accepted stall reason."""
    jump_samples_stalled = jump_samples[jump_samples["Wave_Issued_Instruction"].eq(False)]
    assert (
        jump_samples_stalled["Stall_Reason"].isin(_JUMP_STALL_REASONS).all()
    ), "unexpected stall reason for a jump instruction"


def validate_jump_instructions(all_samples, jump_samples):
    """Validate issued and stalled jump samples."""
    validate_jump_instructions_issued(all_samples, jump_samples)
    validate_jump_instructions_stalled(jump_samples)


# --- patch metadata of the next file, kept from the original diff ---
# diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/message_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/message_instructions.py
# new file mode 100644
# index 0000000000..626d114b37
# --- /dev/null
# +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/s_instructions/message_instructions.py
# @@ -0,0 +1,64 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Instruction type reported for issued message instructions.
_MESSAGE_INSTRUCTION_TYPE = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_MESSAGE"

# Stall reasons accepted for message samples whose wave did not issue.
_MESSAGE_STALL_REASONS = [
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
]


def validate_message_instructions_issued(all_samples, message_samples):
    """Cross-check issued message samples against the reported type.

    Args:
        all_samples: DataFrame of all samples with the
            ``Wave_Issued_Instruction`` (boolean mask) and
            ``Instruction_Type`` columns.
        message_samples: DataFrame with the same columns holding the samples
            the caller pre-selected as message instructions.

    Raises:
        AssertionError: if the issued MESSAGE-typed sample counts differ, or
            an issued message sample has another instruction type.
    """
    message_type_samples_issued = all_samples[
        all_samples["Wave_Issued_Instruction"]
        & (all_samples["Instruction_Type"] == _MESSAGE_INSTRUCTION_TYPE)
    ]

    message_samples_issued = message_samples[
        message_samples["Wave_Issued_Instruction"]
    ]
    # sanity check: both selections must describe the same set of samples
    assert len(message_type_samples_issued) == len(
        message_samples_issued
    ), "Mismatch between issued MESSAGE-typed samples and issued message samples"
    # every issued message sample must carry the MESSAGE type
    assert (
        message_samples_issued["Instruction_Type"] == _MESSAGE_INSTRUCTION_TYPE
    ).all(), "Issued message sample with a non-MESSAGE instruction type"
    # TODO: find an example with messages


def validate_message_instructions_stalled(message_samples):
    """Check that every non-issued message sample has an accepted stall reason.

    Raises:
        AssertionError: if a stalled sample reports an unexpected reason.
    """
    # Elementwise comparison on a Series; `not`/`is` would not work here.
    not_issued = message_samples["Wave_Issued_Instruction"] == False  # noqa: E712
    message_samples_stalled = message_samples[not_issued]
    assert (
        message_samples_stalled["Stall_Reason"].isin(_MESSAGE_STALL_REASONS).all()
    ), "Unexpected stall reason for a message instruction"


def validate_message_instructions(all_samples, message_samples):
    """Run the issued and stalled checks for message instruction samples."""
    validate_message_instructions_issued(all_samples, message_samples)
    validate_message_instructions_stalled(message_samples)
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Instruction type reported for issued instructions classified as OTHER.
_OTHER_INSTRUCTION_TYPE = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_OTHER"

# Stall reasons accepted for OTHER samples whose wave did not issue.
_OTHER_STALL_REASONS = [
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
]


def validate_other_instructions_issued(all_samples, other_samples):
    """Cross-check issued OTHER samples against the reported type.

    Args:
        all_samples: DataFrame of all samples with the
            ``Wave_Issued_Instruction`` (boolean mask) and
            ``Instruction_Type`` columns.
        other_samples: DataFrame with the same columns holding the samples
            the caller pre-selected as OTHER instructions.

    Raises:
        AssertionError: if an issued pre-selected sample has another type, or
            the issued OTHER-typed sample counts differ between the frames.
    """
    other_type_samples_issued = all_samples[
        all_samples["Wave_Issued_Instruction"]
        & (all_samples["Instruction_Type"] == _OTHER_INSTRUCTION_TYPE)
    ]

    other_samples_issued = other_samples[other_samples["Wave_Issued_Instruction"]]
    assert (
        other_samples_issued["Instruction_Type"] == _OTHER_INSTRUCTION_TYPE
    ).all(), "Issued sample with a non-OTHER instruction type"

    assert len(other_type_samples_issued) == len(
        other_samples_issued
    ), "Mismatch between issued OTHER-typed samples and issued other samples"


def validate_other_instructions_stalled(other_samples):
    """Check that every non-issued OTHER sample has an accepted stall reason.

    Raises:
        AssertionError: if a stalled sample reports an unexpected reason.
    """
    # Elementwise comparison on a Series; `not`/`is` would not work here.
    not_issued = other_samples["Wave_Issued_Instruction"] == False  # noqa: E712
    other_samples_stalled = other_samples[not_issued]

    assert (
        other_samples_stalled["Stall_Reason"].isin(_OTHER_STALL_REASONS).all()
    ), "Unexpected stall reason for an OTHER instruction"


def validate_other_instructions(all_samples, filtered_samples):
    """Run the issued and stalled checks for OTHER instruction samples."""
    validate_other_instructions_issued(all_samples, filtered_samples)
    validate_other_instructions_stalled(filtered_samples)
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Instruction type reported for issued scalar instructions.
_SCALAR_INSTRUCTION_TYPE = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_SCALAR"

# Stall reasons accepted for scalar samples whose wave did not issue.
_SCALAR_STALL_REASONS = [
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
]


def validate_scalar_instructions_issued(all_samples, scalar_samples):
    """Cross-check issued scalar samples against the reported type.

    Args:
        all_samples: DataFrame of all samples with the
            ``Wave_Issued_Instruction`` (boolean mask) and
            ``Instruction_Type`` columns.
        scalar_samples: DataFrame with the same columns; per the original
            comment it contains instructions starting with ``s_``.

    Raises:
        AssertionError: if the issued SCALAR-typed sample counts differ, or
            an issued scalar sample has another instruction type.
    """
    # From all samples, extract issued samples with SCALAR type
    scalar_type_samples_issued = all_samples[
        (all_samples["Instruction_Type"] == _SCALAR_INSTRUCTION_TYPE)
        & all_samples["Wave_Issued_Instruction"]
    ]

    # scalar_samples contains instructions starting with `s_`
    scalar_samples_issued = scalar_samples[scalar_samples["Wave_Issued_Instruction"]]
    # sanity check: both selections must describe the same set of samples
    assert len(scalar_type_samples_issued) == len(
        scalar_samples_issued
    ), "Mismatch between issued SCALAR-typed samples and issued scalar samples"
    # every issued scalar sample must carry the SCALAR type
    assert (
        scalar_samples_issued["Instruction_Type"] == _SCALAR_INSTRUCTION_TYPE
    ).all(), "Issued scalar sample with a non-SCALAR instruction type"


def validate_scalar_instructions_stalled(scalar_samples):
    """Check that every non-issued scalar sample has an accepted stall reason.

    Raises:
        AssertionError: if a stalled sample reports an unexpected reason.
    """
    # Elementwise comparison on a Series; `not`/`is` would not work here.
    not_issued = scalar_samples["Wave_Issued_Instruction"] == False  # noqa: E712
    scalar_samples_stalled = scalar_samples[not_issued]

    assert (
        scalar_samples_stalled["Stall_Reason"].isin(_SCALAR_STALL_REASONS).all()
    ), "Unexpected stall reason for a scalar instruction"


def validate_scalar_instructions(all_samples, scalar_samples):
    """Run the issued and stalled checks for scalar instruction samples."""
    validate_scalar_instructions_issued(all_samples, scalar_samples)
    validate_scalar_instructions_stalled(scalar_samples)
Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Stall reasons accepted for `s_waitcnt` samples.
_WAITCNT_STALL_REASONS = [
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_WAITCNT",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
]


def validate_waitcnt(all_samples, waitcnt_samples):
    """Validate samples of `s_waitcnt` instructions.

    Args:
        all_samples: DataFrame of all samples; must provide an
            ``Instruction`` column of strings.
        waitcnt_samples: DataFrame holding the samples the caller
            pre-selected as `s_waitcnt`; must provide the
            ``Wave_Issued_Instruction`` and ``Stall_Reason`` columns.

    Raises:
        AssertionError: if the pre-selected frame does not contain as many
            rows as there are `s_waitcnt` instructions in ``all_samples``,
            if any waitcnt sample is marked as issued, or if a stall reason
            is outside the accepted set.
    """
    s_waitcnt_samples = all_samples[
        all_samples["Instruction"].apply(lambda x: x.startswith("s_waitcnt"))
    ]
    # sanity check: the caller's selection must match the prefix selection
    assert len(s_waitcnt_samples) == len(
        waitcnt_samples
    ), "Mismatch between s_waitcnt samples and pre-selected waitcnt samples"

    # `s_waitcnt` instructions are never issued on GFX9
    # (elementwise comparison on a Series; `not`/`is` would not work here)
    assert (
        waitcnt_samples["Wave_Issued_Instruction"] == False  # noqa: E712
    ).all(), "s_waitcnt sample unexpectedly marked as issued"
    # only the accepted stall reasons may be reported
    assert (
        waitcnt_samples["Stall_Reason"].isin(_WAITCNT_STALL_REASONS).all()
    ), "Unexpected stall reason for s_waitcnt"
# Instruction type reported for issued texture (buffer) instructions.
_TEX_INSTRUCTION_TYPE = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_TEX"

# Mnemonic prefix used to recognise texture instructions. The issued and the
# stalled checks share this single prefix so they select the same instructions.
_TEX_PREFIX = "buffer_"

# Stall reasons accepted for texture samples whose wave did not issue.
_TEX_STALL_REASONS = [
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
]


def _is_texture_instruction(instruction):
    """Return True when the instruction text starts with the texture prefix."""
    return instruction.startswith(_TEX_PREFIX)


def validate_texture_instructions_issued(samples_issued):
    """Check the TEX type <-> `buffer_` prefix equivalence for issued samples.

    Args:
        samples_issued: DataFrame of issued samples with ``Instruction``
            (string) and ``Instruction_Type`` columns.

    Raises:
        AssertionError: if a TEX-typed sample is not a `buffer_` instruction,
            or a `buffer_` instruction is not TEX-typed.
    """
    # issued instruction with type == TEX -> instruction starts with buffer_
    issued_type_texture = samples_issued[
        samples_issued["Instruction_Type"] == _TEX_INSTRUCTION_TYPE
    ]
    assert (
        issued_type_texture["Instruction"].apply(_is_texture_instruction).all()
    ), "Issued TEX-typed sample is not a buffer_ instruction"

    # issued instruction starts with buffer_ -> it must be TEX
    issued_buffer = samples_issued[
        samples_issued["Instruction"].apply(_is_texture_instruction)
    ]
    assert (
        issued_buffer["Instruction_Type"] == _TEX_INSTRUCTION_TYPE
    ).all(), "Issued buffer_ instruction with a non-TEX instruction type"
    # TODO: find an example with TEX instructions


def validate_texture_instructions_stalled(samples):
    """Check stall reasons of `buffer_` samples whose wave did not issue.

    Args:
        samples: DataFrame with ``Instruction``, ``Wave_Issued_Instruction``
            and ``Stall_Reason`` columns.

    Raises:
        AssertionError: if a stalled texture sample reports an unexpected
            stall reason.
    """
    texture_samples = samples[samples["Instruction"].apply(_is_texture_instruction)]
    # Elementwise comparison on a Series; `not`/`is` would not work here.
    not_issued = texture_samples["Wave_Issued_Instruction"] == False  # noqa: E712
    texture_stalled = texture_samples[not_issued]

    assert (
        texture_stalled["Stall_Reason"].isin(_TEX_STALL_REASONS).all()
    ), "Unexpected stall reason for a texture instruction"

    # TODO: find an example with texture instructions


def validate_texture_instructions(samples):
    """Run the issued and stalled texture checks on a frame of samples."""
    samples_issued = samples[samples["Wave_Issued_Instruction"]]
    validate_texture_instructions_issued(samples_issued)
    validate_texture_instructions_stalled(samples)
a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/valu_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/valu_instructions.py new file mode 100644 index 0000000000..80fd954562 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/csv/gfx9/valu_instructions.py @@ -0,0 +1,69 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Instruction type reported for issued VALU instructions.
_VALU_INSTRUCTION_TYPE = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_VALU"

# Stall reasons accepted for VALU samples whose wave did not issue.
_VALU_STALL_REASONS = [
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
]


def _is_valu_instruction(instruction):
    """Return True for `v_` instructions that are not matrix (mfma) ones."""
    return instruction.startswith("v_") and ("mfma" not in instruction)


def validate_valu_instructions_issued(samples_issued):
    """Check the VALU type <-> `v_` prefix relation for issued samples.

    Args:
        samples_issued: DataFrame of issued samples with ``Instruction``
            (string) and ``Instruction_Type`` columns.

    Raises:
        AssertionError: if a VALU-typed sample does not start with `v_`, or a
            non-matrix `v_` instruction is not VALU-typed.
    """
    # issued instruction with type == VALU -> instruction starts with v_
    issued_type_valu = samples_issued[
        samples_issued["Instruction_Type"] == _VALU_INSTRUCTION_TYPE
    ]
    assert (
        issued_type_valu["Instruction"].apply(lambda x: x.startswith("v_")).all()
    ), "Issued VALU-typed sample is not a v_ instruction"

    # issued instruction starts with v_ and is not matrix instruction
    # -> it must be VALU
    issued_v = samples_issued[
        samples_issued["Instruction"].apply(_is_valu_instruction)
    ]
    assert (
        issued_v["Instruction_Type"] == _VALU_INSTRUCTION_TYPE
    ).all(), "Issued non-matrix v_ instruction with a non-VALU instruction type"


def validate_valu_instructions_stalled(samples):
    """Check stall reasons of non-matrix `v_` samples that did not issue.

    Args:
        samples: DataFrame with ``Instruction``, ``Wave_Issued_Instruction``
            and ``Stall_Reason`` columns.

    Raises:
        AssertionError: if a stalled VALU sample reports an unexpected
            stall reason.
    """
    valu_samples = samples[samples["Instruction"].apply(_is_valu_instruction)]
    # Elementwise comparison on a Series; `not`/`is` would not work here.
    not_issued = valu_samples["Wave_Issued_Instruction"] == False  # noqa: E712
    valu_stalled = valu_samples[not_issued]

    assert (
        valu_stalled["Stall_Reason"].isin(_VALU_STALL_REASONS).all()
    ), "Unexpected stall reason for a VALU instruction"


def validate_valu_instructions(samples):
    """Run the issued and stalled VALU checks on a frame of samples."""
    samples_issued = samples[samples["Wave_Issued_Instruction"]]
    validate_valu_instructions_issued(samples_issued)
    validate_valu_instructions_stalled(samples)
${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling/stochastic/json + ) + +set(PC_SAMPLING_PYTHON_SOURCES __init__.py) + +foreach(_FILE ${PC_SAMPLING_PYTHON_SOURCES}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} ${PACKAGE_OUTPUT_DIR}/${_FILE} + COPYONLY) +endforeach() + +add_subdirectory(gfx9) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/__init__.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/__init__.py new file mode 100644 index 0000000000..8e876b72cd --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/__init__.py @@ -0,0 +1,24 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + +from __future__ import absolute_import diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/CMakeLists.txt b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/CMakeLists.txt new file mode 100644 index 0000000000..c3c5c5fc31 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# +# + +set(PACKAGE_OUTPUT_DIR + ${ROCPROFILER_SDK_TESTS_BINARY_DIR}/pytest-packages/rocprofiler_sdk/pc_sampling/stochastic/json/gfx9 + ) + +set(PC_SAMPLING_PYTHON_SOURCES __init__.py arbiter_state.py s_instructions.py + other_instructions.py) + +foreach(_FILE ${PC_SAMPLING_PYTHON_SOURCES}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} ${PACKAGE_OUTPUT_DIR}/${_FILE} + COPYONLY) +endforeach() + +# add_subdirectory(s_instructions) diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/__init__.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/__init__.py new file mode 100644 index 0000000000..7f71185f36 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/__init__.py @@ -0,0 +1,176 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from __future__ import absolute_import + +import numpy as np +import pandas as pd +from collections import defaultdict +from .arbiter_state import validate_arbiter_state +from .other_instructions import ( + validate_valu_instructions, + validate_flat_instructions, + validate_lds_instructions, +) +from .s_instructions import ( + validate_internal_instructions, + validate_barrier_instructions, + validate_waitcnt, + validate_branch_instructions, + validate_scalar_instructions, +) + +# Using Prefix Tree to classify the instruction type +# I did this instead of the regex because I wanted to see if we could +# generalize this approach for other types of instructions. +# The dream scenario: We have a giant list of all instructions and their +# types. Then we parse the list and dynamically determine the checks +# based on the instruction types.
# TODO: extract this outside of the file
class TrieNode:
    """Single node of the instruction-prefix trie."""

    def __init__(self):
        # Maps the next character to the child node.
        self.children = {}
        # Instruction type of the prefix ending at this node; None when no
        # inserted prefix ends here.
        self.instruction_type = None


class PrefixTree:
    """Character trie mapping instruction-name prefixes to instruction types."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, full_prefix, instruction_type):
        """Insert a prefix and its associated instruction type into the Trie.

        Inserting the same prefix again overwrites the previously stored type.
        """
        node = self.root
        for char in full_prefix:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        # Assign the instruction type at the node where the prefix ends
        node.instruction_type = instruction_type

    def get_instruction_type(self, instruction):
        """Return the types of all inserted prefixes that `instruction` starts with.

        The list is ordered from the shortest matching prefix to the longest,
        so the last element belongs to the most specific prefix. It is empty
        when no inserted prefix matches.
        """
        node = self.root
        matched_types = []

        # Traverse the instruction one character at a time
        for char in instruction:
            if char not in node.children:
                break  # Stop once the instruction leaves the trie

            node = node.children[char]

            # A prefix ends at this node: remember its type
            if node.instruction_type:
                matched_types.append(node.instruction_type)

        return matched_types


# (prefix, instruction type) pairs fed into the PrefixTree. Each prefix is
# listed once: PrefixTree.insert overwrites on a repeated prefix, so duplicate
# or conflicting entries would be silently dropped anyway.
instructions_with_types = [
    ("s_", "SCALAR"),  # Scalar instructions (general category)
    ("s_waitcnt", "WAITCNT"),  # WAITCNT (specific)
    ("s_sendmsg", "MESSAGE"),  # MESSAGE (specific)
    ("s_barrier", "BARRIER"),  # BARRIER (specific)
    ("s_swappc", "JUMP"),  # JUMP (specific)
    ("s_setpc", "JUMP"),  # JUMP
    ("s_branch", "BRANCH"),  # BRANCH
    ("s_cbranch", "BRANCH"),  # BRANCH (conditional)
    ("s_wakeup", "OTHER"),  # OTHER
    ("s_nop", "INTERNAL"),  # INTERNAL
    ("s_sleep", "INTERNAL"),  # INTERNAL
    ("v_", "VALU"),  # VALU
    ("v_mfma", "MATRIX"),  # MATRIX
    ("flat_", "FLAT"),  # FLAT
    ("global_", "FLAT"),  # FLAT
    ("ds_", "LDS"),  # LDS
    ("buffer_", "TEX"),  # TEX
]


# Validator invoked with the list of sample records of each prefix type.
# Types without an entry here (the commented-out ones, and any other type in
# the table above) make validate_stochastic_samples_json fail when a sample
# of that type is encountered.
inst_type_verify_functions = {
    "BRANCH": validate_branch_instructions,
    "WAITCNT": validate_waitcnt,
    # "OTHER": validate_other_instructions,
    "SCALAR": validate_scalar_instructions,
    "INTERNAL": validate_internal_instructions,
    # "JUMP": validate_jump_instructions,
    # "MESSAGE": validate_message_instructions,
    "BARRIER": validate_barrier_instructions,
    "VALU": validate_valu_instructions,
    "FLAT": validate_flat_instructions,
    "LDS": validate_lds_instructions,
}


def validate_stochastic_samples_json(data_json):
    """Validate all stochastic PC samples found in a JSON output document.

    Reads ``data_json["strings"]["pc_sample_instructions"]`` and
    ``data_json["buffer_records"]["pc_sample_stochastic"]``. Samples with
    ``inst_index == -1`` are skipped. Every other sample's record is extended
    in place with an ``"inst"`` key holding the instruction text, checked for
    a valid wave count and arbiter state, and then grouped by instruction
    type so that the per-type validator runs once per group.

    Raises:
        AssertionError: if an instruction matches no known prefix, a wave
            count is outside [0, 32], the arbiter state is invalid, a
            per-type validator fails, or no validator exists for a type.
    """
    # fill in the Prefix Tree
    prefix_tree = PrefixTree()
    for prefix, instruction_type in instructions_with_types:
        prefix_tree.insert(prefix, instruction_type)

    instructions = data_json["strings"]["pc_sample_instructions"]

    insts_per_prefix_type = defaultdict(list)

    for sample in data_json["buffer_records"]["pc_sample_stochastic"]:
        inst_index = sample["inst_index"]
        if inst_index == -1:
            # Ignoring samples from blit kernels
            continue
        record = sample["record"]
        # extend the record with the instruction (validators read it)
        record["inst"] = instructions[inst_index]

        # get the instruction type from prefix tree
        inst_prefix_types = prefix_tree.get_instruction_type(record["inst"])
        # each instruction must match at least one known prefix
        assert (
            len(inst_prefix_types) > 0
        ), f"No known prefix matches instruction: {record['inst']}"
        # As more than one type can be matched, we take the last one as the
        # most specific.
        inst_prefix_type = inst_prefix_types[-1]
        insts_per_prefix_type[inst_prefix_type].append(record)

        # For each sample, we need to validate wave_cnt and arbiter state
        wave_cnt = record["wave_cnt"]
        assert 0 <= wave_cnt <= 32, "Invalid wave count"

        # arbiter state check
        snapshot = record["snapshot"]
        validate_arbiter_state(snapshot)

    # Check now the instruction type and arb state correlation.
    # We do that for all samples of a single instruction type at once to
    # minimize the number of function calls (one call for all samples,
    # instead of a function call per sample).
    # Please note that each sample is iterated at most twice: the first time
    # to group samples per instruction type, and the second time to validate.
    for inst_prefix_type, sample_records in insts_per_prefix_type.items():
        verify = inst_type_verify_functions.get(inst_prefix_type)
        if verify is None:
            # Raised explicitly so the failure is not stripped under `-O`.
            raise AssertionError(f"Unhandle instruction type: {inst_prefix_type}")
        verify(sample_records)
def validate_arbiter_state(snapshot):
    """Assert that the arbiter issue/stall flags of one sample are consistent.

    `snapshot` is a mapping holding ``dual_issue_valu`` plus the
    ``arb_state_issue_*`` / ``arb_state_stall_*`` flags read below.

    Checks, in order:
      * VALU: with dual issue the only accepted state is issue=1, stall=0;
        otherwise issue=0 with stall=1 is rejected.
      * Matrix, scalar, texture, LDS, flat and misc pipes: issue=0 with
        stall=1 is rejected.
      * Export pipe: stall must be 0 (issue may be 0 or 1).
      * LDS-direct and BRMSG pipes: both flags must be 0, as these pipes do
        not exist on GFX9.

    Raises:
        AssertionError: on the first violated rule, naming the pipe.
    """
    # VALU pipe checks
    valu_issue = snapshot["arb_state_issue_valu"]
    if snapshot["dual_issue_valu"]:
        # (valu_issue = 1 & valu_stall = 0) is the only allowed state
        dual_issue_ok = valu_issue == 1 and snapshot["arb_state_stall_valu"] == 0
        assert dual_issue_ok, "Dual issue VALU arbiter state check failed"
    else:
        # (valu_issue = 0 & valu_stall = 1) is not allowed
        valu_stalled_only = valu_issue == 0 and snapshot["arb_state_stall_valu"] == 1
        assert not valu_stalled_only, "VALU arbiter state check failed"

    # Pipes sharing the rule "issue = 0 & stall = 1 is not allowed".
    # For the misc pipe the original notes carry a TODO to verify the rule:
    # the stricter expectation that misc_stall is always 0 failed for the
    # `transpose` application, so this weaker condition is used instead.
    stall_requires_issue = (
        (
            "arb_state_issue_matrix",
            "arb_state_stall_matrix",
            "Matrix arbiter state check failed",
        ),
        (
            "arb_state_issue_scalar",
            "arb_state_stall_scalar",
            "Scalar arbiter state check failed",
        ),
        (
            "arb_state_issue_vmem_tex",
            "arb_state_stall_vmem_tex",
            "Texture arbiter state check failed",
        ),
        (
            "arb_state_issue_lds",
            "arb_state_stall_lds",
            "LDS arbiter state check failed",
        ),
        (
            "arb_state_issue_flat",
            "arb_state_stall_flat",
            "Flat arbiter state check failed",
        ),
        (
            "arb_state_issue_misc",
            "arb_state_stall_misc",
            "Misc arbiter state check failed",
        ),
    )
    for issue_key, stall_key, failure in stall_requires_issue:
        # The stall flag is only read when the issue flag is 0.
        stalled_only = snapshot[issue_key] == 0 and snapshot[stall_key] == 1
        assert not stalled_only, failure

    # Flags that must always be 0.
    # Export pipe: assumed to follow the original expectation for the misc
    # pipe (TODO: verify); exp_issue may be 0 or 1, exp_stall must be 0.
    # LDS-direct and BRMSG pipes don't exist on GFX9, so both their issue and
    # stall flags must be 0.
    must_be_zero = (
        ("arb_state_stall_exp", "Export arbiter state check failed"),
        ("arb_state_issue_lds_direct", "LDS Direct arbiter state check failed"),
        ("arb_state_stall_lds_direct", "LDS Direct arbiter state check failed"),
        ("arb_state_issue_brmsg", "BRMSG arbiter state check failed"),
        ("arb_state_stall_brmsg", "BRMSG arbiter state check failed"),
    )
    for flag_key, failure in must_be_zero:
        assert snapshot[flag_key] == 0, failure
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Stall reasons accepted for VALU samples whose wave did not issue.
_VALU_ALLOWED_STALL_REASONS = frozenset(
    {
        "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
        "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
        "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
    }
)

# Stall reasons accepted for FLAT samples whose wave did not issue.
_FLAT_ALLOWED_STALL_REASONS = frozenset(
    {
        "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL",
        "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE",
        "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN",
        "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY",
    }
)


def validate_valu_instructions(sample_records):
    """Validate sample records classified as VALU instructions.

    Args:
        sample_records: iterable of record dicts providing ``inst``,
            ``wave_issued``, ``snapshot`` and, for issued samples,
            ``inst_type``.

    Raises:
        AssertionError: if a record's instruction does not start with `v_`,
            an issued record has a wrong type or arbiter state, or a stalled
            record has an unexpected stall reason or arbiter state.
    """
    for record in sample_records:
        assert record["inst"].startswith("v_"), "VALU instruction must start with 'v_'"

        snapshot = record["snapshot"]
        if record["wave_issued"] == 1:
            # wave issued a VALU instruction
            assert (
                record["inst_type"] == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_VALU"
            ), "Invalid instruction type for VALU instruction"
            assert snapshot["arb_state_issue_valu"] == 1, "Arbiter issued valu"
            assert (
                snapshot["arb_state_stall_valu"] == 0
            ), "Arbiter should not stall valu"
            continue

        # wave did not issue a VALU instruction
        # inst_type is not relevant
        stall_reason = snapshot["stall_reason"]
        assert (
            stall_reason in _VALU_ALLOWED_STALL_REASONS
        ), "Invalid stall reason for VALU instruction"

        if (
            stall_reason
            == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL"
        ):
            assert snapshot["arb_state_issue_valu"] == 1, "Arbiter issued valu"
            # Expectation would be that the `arb_state_stall_valu` is 1, but in
            # some examples different behavior has been observed.
        elif (
            stall_reason
            == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN"
        ):
            # NOTE(review): the message talks about a Matrix instruction being
            # *issued*, yet the check reads `arb_state_stall_matrix`. Confirm
            # whether `arb_state_issue_matrix` was intended; kept as-is.
            assert (
                snapshot["arb_state_issue_valu"] == 1
                or snapshot["arb_state_stall_matrix"] == 1
            ), "VALU or Matrix instruction should be issued"


def validate_flat_instructions(sample_records):
    """Validate sample records classified as FLAT (flat_/global_) instructions.

    Args:
        sample_records: iterable of record dicts providing ``inst``,
            ``wave_issued``, ``snapshot`` and, for issued samples,
            ``inst_type``.

    Raises:
        AssertionError: if a record's instruction has neither the `flat_` nor
            the `global_` prefix, an issued record has a wrong type or arbiter
            state, or a stalled record has an unexpected stall reason or
            arbiter state.
    """
    for record in sample_records:
        assert record["inst"].startswith(
            ("flat_", "global_")
        ), "Invalid name of FLAT instruction"

        snapshot = record["snapshot"]
        if record["wave_issued"] == 1:
            # wave issued a flat memory instruction
            assert (
                record["inst_type"] == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_FLAT"
            ), "Invalid instruction type for FLAT instruction"
            assert snapshot["arb_state_issue_flat"] == 1, "Arbiter issued flat"
            assert (
                snapshot["arb_state_stall_flat"] == 0
            ), "Arbiter should not stalled flat"

            # TODO: add checks when flat stalls LDS, and vice versa
            # If global_ inst, check ISSUE_FLAT=1, STALL_FLAT=0, ISSUE_LDS=1 -> STALL_LDS = 1
            continue

        # wave did not issue a flat instruction
        # inst_type is not relevant
        stall_reason = snapshot["stall_reason"]
        assert (
            stall_reason in _FLAT_ALLOWED_STALL_REASONS
        ), "Invalid stall reason for flat instruction"

        if (
            stall_reason
            == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL"
        ):
            assert snapshot["arb_state_issue_flat"] == 1, "Arbiter issued flat"
            assert snapshot["arb_state_stall_flat"] == 1, "EX stalled flat"

        # In case of flat instructions, ARBITER_NOT_WIN might mean that
        # the FLAT/VMEM pipe was idle, so the flat instruction is issued to the
        # arbiter to wake up the clock in FLAT/VMEM, but cannot be issued to
        # the execution pipeline. Afterwards, the same instruction is reissued
        # to the arbiter that sends it to the execution pipeline.
        # That's why `arb_state_issue_flat` is not checked for ARBITER_NOT_WIN.
wake up the clock in FLAT/VMEM, but cannot be issued to the execution pipeline. + # Afterwards, the same instruction is reissued to the arbiter that sends it to the execution pipeline. + # That's why `Arbiter_State_Issue_Flat` is not always true as in some other cases. + + +def validate_lds_instructions(sample_records): + allowed_stall_reasons = set( + [ + "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL", + "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE", + "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN", + "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY", + ] + ) + for record in sample_records: + assert record["inst"].startswith("ds_"), "Invalid name of LDS instruction" + + snapshot = record["snapshot"] + if record["wave_issued"] == 1: + # wave issued an LDS memory instruction + assert ( + record["inst_type"] == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_LDS" + ), "Invalid instruction type for LDS instruction" + assert snapshot["arb_state_issue_lds"] == 1, "Arbiter issued lds" + assert snapshot["arb_state_stall_lds"] == 0, "EX should not stalled lds" + + # TODO: add checks when LDS stalls flat, and vice versa + # ISSUE_LDS=1, STALL_LDS=0, ISSUE_FLAT=1 -> STALL_FLAT = 1 + else: + # wave did not issue an LDS instruction + # inst_type is not relevant + stall_reason = snapshot["stall_reason"] + assert ( + stall_reason in allowed_stall_reasons + ), "Invalid stall reason for LDS instruction" + + if ( + stall_reason + == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL" + ): + assert snapshot["arb_state_issue_lds"] == 1, "Arbiter issued flat" + assert snapshot["arb_state_stall_lds"] == 1, "EX stalled flat" + elif ( + stall_reason + == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN" + ): + assert snapshot["arb_state_issue_lds"] == 1, "Arbiter issued flat" diff --git 
a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/s_instructions.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/s_instructions.py new file mode 100644 index 0000000000..b04f1a7bdd --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/stochastic/json/gfx9/s_instructions.py @@ -0,0 +1,222 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# "Instruction not issued" reasons referenced by the validators in this module.
_REASON_INTERNAL_INSTRUCTION = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_INTERNAL_INSTRUCTION"
)
_REASON_NO_INSTRUCTION_AVAILABLE = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_NO_INSTRUCTION_AVAILABLE"
)
_REASON_WAITCNT = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_WAITCNT"
_REASON_ALU_DEPENDENCY = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ALU_DEPENDENCY"
)
_REASON_ARBITER_NOT_WIN = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_NOT_WIN"
)
_REASON_ARBITER_WIN_EX_STALL = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_ARBITER_WIN_EX_STALL"
)
_REASON_BARRIER_WAIT = (
    "ROCPROFILER_PC_SAMPLING_INSTRUCTION_NOT_ISSUED_REASON_BARRIER_WAIT"
)

# Instruction types reported for issued branch instructions.
_TYPE_BRANCH_TAKEN = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_TAKEN"
_TYPE_BRANCH_NOT_TAKEN = "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BRANCH_NOT_TAKEN"


def validate_internal_instructions(sample_records):
    """Check samples of internal instructions (currently only ``s_nop``).

    Each record is read through ``"inst"``, ``"wave_issued"`` and
    ``record["snapshot"]["stall_reason"]``.

    Raises:
        AssertionError: on the first record that violates an expectation.
    """
    allowed_stall_reasons = {
        _REASON_INTERNAL_INSTRUCTION,
        _REASON_NO_INSTRUCTION_AVAILABLE,
    }
    for record in sample_records:
        assert record["inst"].startswith("s_nop"), "New internal instruction observed"
        assert (
            record["wave_issued"] == 0
        ), "Internal instruction should not be issued to EX"
        assert (
            record["snapshot"]["stall_reason"] in allowed_stall_reasons
        ), "Invalid stall reason for internal instruction"


def validate_waitcnt(sample_records):
    """Check samples of ``s_waitcnt`` instructions.

    Each record is read through ``"inst"``, ``"wave_issued"`` and
    ``record["snapshot"]["stall_reason"]``.

    Raises:
        AssertionError: on the first record that violates an expectation.
    """
    allowed_stall_reasons = {
        _REASON_WAITCNT,
        _REASON_NO_INSTRUCTION_AVAILABLE,
    }
    for record in sample_records:
        assert record["inst"].startswith(
            "s_waitcnt"
        ), "Waitcnt must start with s_waitcnt"
        assert record["wave_issued"] == 0, "Waitcnt should not be issued to EX"
        assert (
            record["snapshot"]["stall_reason"] in allowed_stall_reasons
        ), "Invalid stall reason for waitcnt"


def validate_branch_instructions(sample_records):
    """Check samples of conditional (``s_cbranch*``) and unconditional
    (``s_branch*``) branch instructions.

    Each record is read through ``"inst"``, ``"wave_issued"`` and
    ``"snapshot"``; ``"inst_type"`` is only read for issued samples and
    ``snapshot["stall_reason"]`` only for samples that were not issued.

    Raises:
        AssertionError: on the first record that violates an expectation.
    """
    allowed_stall_reasons = {
        _REASON_NO_INSTRUCTION_AVAILABLE,
        _REASON_ALU_DEPENDENCY,
        _REASON_ARBITER_NOT_WIN,
        _REASON_ARBITER_WIN_EX_STALL,
    }
    allowed_stall_reasons_unconditional_branches = {
        _REASON_NO_INSTRUCTION_AVAILABLE,
        _REASON_ARBITER_NOT_WIN,
    }
    for record in sample_records:
        inst = record["inst"]
        snapshot = record["snapshot"]
        assert inst.startswith(
            ("s_cbranch", "s_branch")
        ), "Branch must start with s_cbranch or s_branch"
        is_unconditional = inst.startswith("s_branch")

        if record["wave_issued"] == 1:
            inst_type = record["inst_type"]
            if is_unconditional:
                # An issued unconditional branch can only be "branch taken"
                assert (
                    inst_type == _TYPE_BRANCH_TAKEN
                ), "Unconditional branch must be taken"
            else:
                # An issued conditional branch is either taken or not taken
                assert inst_type in (
                    _TYPE_BRANCH_TAKEN,
                    _TYPE_BRANCH_NOT_TAKEN,
                ), "Invalid branch type for conditional branch instruction"

            assert (
                snapshot["arb_state_issue_misc"] == 1
                and snapshot["arb_state_stall_misc"] == 0
            ), "Invalid arb state for issued branch instruction"
            continue

        # verifying branch instructions that were not issued
        stall_reason = snapshot["stall_reason"]
        assert (
            stall_reason in allowed_stall_reasons
        ), "Invalid stall reason for branch instruction"

        if stall_reason == _REASON_ARBITER_NOT_WIN:
            assert (
                snapshot["arb_state_issue_misc"] == 1
            ), "Arbiter must have issued MISC instruction"
        elif stall_reason == _REASON_ARBITER_WIN_EX_STALL:
            assert (
                snapshot["arb_state_issue_misc"] == 1
            ), "Arbiter must have issued MISC instruction"
            assert (
                snapshot["arb_state_stall_misc"] == 1
            ), "Arbiter must have stalled MISC instruction"

        # more specific checks for unconditional branches
        if is_unconditional:
            assert (
                stall_reason in allowed_stall_reasons_unconditional_branches
            ), "Invalid stall reason for unconditional branch instruction"


def validate_scalar_instructions(sample_records):
    """Check samples of scalar instructions.

    The instruction text is not inspected; each record is read through
    ``"wave_issued"``, ``"inst_type"`` and ``"snapshot"``.

    Raises:
        AssertionError: on the first record that violates an expectation.
    """
    allowed_stall_reasons = {
        _REASON_NO_INSTRUCTION_AVAILABLE,
        _REASON_ALU_DEPENDENCY,
        _REASON_ARBITER_NOT_WIN,
        _REASON_ARBITER_WIN_EX_STALL,
    }

    for record in sample_records:
        snapshot = record["snapshot"]
        if record["wave_issued"] == 1:
            assert (
                record["inst_type"] == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_SCALAR"
            ), "Invalid scalar instruction type"
            assert (
                snapshot["arb_state_issue_scalar"] == 1
            ), "Arbiter must have issued scalar instruction"
            assert (
                snapshot["arb_state_stall_scalar"] == 0
            ), "Arbiter must not have stalled scalar instruction"
            continue

        stall_reason = snapshot["stall_reason"]
        assert (
            stall_reason in allowed_stall_reasons
        ), "Invalid stall reason for scalar instruction"

        if stall_reason == _REASON_ARBITER_NOT_WIN:
            assert (
                snapshot["arb_state_issue_scalar"] == 1
            ), "Arbiter must have issued scalar instruction"
        elif stall_reason == _REASON_ARBITER_WIN_EX_STALL:
            assert (
                snapshot["arb_state_issue_scalar"] == 1
            ), "Arbiter must have issued scalar instruction"
            assert (
                snapshot["arb_state_stall_scalar"] == 1
            ), "Arbiter must have stalled scalar instruction"


def validate_barrier_instructions(sample_records):
    """Check samples of ``s_barrier`` instructions.

    Each record is read through ``"inst"``, ``"wave_issued"``, ``"inst_type"``
    and ``"snapshot"``.

    Raises:
        AssertionError: on the first record that violates an expectation.
    """
    allowed_stall_reasons = {
        _REASON_NO_INSTRUCTION_AVAILABLE,
        _REASON_ARBITER_NOT_WIN,
        _REASON_BARRIER_WAIT,
    }
    for record in sample_records:
        assert record["inst"].startswith(
            "s_barrier"
        ), "Barrier instruction must start with s_barrier"
        snapshot = record["snapshot"]
        if record["wave_issued"] == 1:
            assert (
                record["inst_type"] == "ROCPROFILER_PC_SAMPLING_INSTRUCTION_TYPE_BARRIER"
            ), "Invalid barrier instruction type"
            assert (
                snapshot["arb_state_issue_misc"] == 1
            ), "Arbiter must have issued barrier instruction"
            assert (
                snapshot["arb_state_stall_misc"] == 0
            ), "Arbiter must not have stalled barrier instruction"
            continue

        stall_reason = snapshot["stall_reason"]
        assert (
            stall_reason in allowed_stall_reasons
        ), "Invalid stall reason for barrier instruction"

        if stall_reason == _REASON_ARBITER_NOT_WIN:
            assert (
                snapshot["arb_state_issue_misc"] == 1
            ), "Arbiter must have issued misc instruction"


# TODO: cover other types of instructions
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import absolute_import diff --git a/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/transpose_multiple_agents/csv.py b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/transpose_multiple_agents/csv.py new file mode 100644 index 0000000000..4ea09ba546 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/pytest-packages/pc_sampling/transpose_multiple_agents/csv.py @@ -0,0 +1,93 @@ +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

#!/usr/bin/env python3

import itertools
import sys
import pytest
import numpy as np
import pandas as pd


def validate_all_agents_are_sampled(
    input_samples_csv: pd.DataFrame,
    input_kernel_trace_csv: pd.DataFrame,
    input_agent_info_csv: pd.DataFrame,
):
    """Verify that PC samples were collected on every gfx90a/gfx94*/gfx95* agent.

    Args:
        input_samples_csv: PC samples; the ``Dispatch_Id`` and
            ``Instruction_Comment`` columns are read.
        input_kernel_trace_csv: kernel trace; the ``Dispatch_Id`` and
            ``Agent_Id`` columns are read, with ``Agent_Id`` formatted like
            ``"Agent 3"``.
        input_agent_info_csv: agent information; the ``Name`` column is read.

    Raises:
        AssertionError: if a sampled dispatch is absent from the kernel trace,
            if the number of sampled agents differs from the number of
            matching agents, or if a sample decoded to
            ``transpose-all-agents.cpp`` lies outside the kernel's source lines.
    """
    transpose_kernel_source_line_start = 137
    transpose_kernel_source_line_end = 145

    mi2xx_mi3xx_agents_df = input_agent_info_csv[
        input_agent_info_csv["Name"].apply(
            lambda name: name == "gfx90a" or name.startswith(("gfx94", "gfx95"))
        )
    ]

    # Keep only the samples that carry a non-zero dispatch id
    samples_df = input_samples_csv[input_samples_csv["Dispatch_Id"] != 0].copy()

    # Determine the agent on which each sample was generated.
    # Note: Agent_Id in the kernel trace is formatted like "Agent 3", so the
    # integer part has to be extracted before the per-dispatch lookup.
    agent_id_per_dispatch = (
        input_kernel_trace_csv.set_index("Dispatch_Id")["Agent_Id"]
        .str.split(" ")
        .str[1]
    )
    sample_agent_ids = samples_df["Dispatch_Id"].map(agent_id_per_dispatch)
    # Without this check a missing dispatch surfaces as an obscure
    # NaN-to-integer conversion error in the cast below.
    assert (
        sample_agent_ids.notna().all()
    ), "Sampled dispatch is missing from the kernel trace"
    samples_df["Agent_Id"] = sample_agent_ids.astype(np.uint64)

    # all agents must be sampled
    sampled_agents_num = len(samples_df["Agent_Id"].unique())
    assert sampled_agents_num == len(
        mi2xx_mi3xx_agents_df
    ), "Number of sampled agents differs from the number of gfx90a/gfx94*/gfx95* agents"

    # separate samples per agent
    for _agent_id, agent_samples_df in samples_df.groupby("Agent_Id"):
        sampled_dispatches = agent_samples_df["Dispatch_Id"].unique()
        assert len(sampled_dispatches) >= 1, "At least 1 sampled dispatch per agent"

    # extract decoded samples that are mapped to the transpose-all-agents.cpp file
    transpose_samples_df = samples_df[
        samples_df["Instruction_Comment"].apply(
            lambda comment: "transpose-all-agents.cpp" in comment
        )
    ].copy()
    # the line number is the text after the last ':' of the instruction comment
    transpose_samples_df["Source_Line_Num"] = transpose_samples_df[
        "Instruction_Comment"
    ].apply(lambda source_line: int(source_line.split(":")[-1]))
    # every such sample must fall inside the kernel's source line range
    assert (
        (transpose_samples_df["Source_Line_Num"] >= transpose_kernel_source_line_start)
        & (transpose_samples_df["Source_Line_Num"] <= transpose_kernel_source_line_end)
    ).all(), "Sample is attributed to a source line outside of the transpose kernel"
index ee914946e6..ab3d4a96f6 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py @@ -32,360 +32,19 @@ import pandas as pd # =========================== Validating CSV output -# Keep this in case we decide to revert workgroup_id information -def validate_workgoup_id_x_y_z(df, max_x, max_y, max_z): - assert (df["Workgroup_Size_X"].astype(int) >= 0).all() - assert (df["Workgroup_Size_X"].astype(int) <= max_x).all() - - assert (df["Workgroup_Size_Y"].astype(int) >= 0).all() - assert (df["Workgroup_Size_Y"].astype(int) <= max_y).all() - - assert (df["Workgroup_Size_Z"].astype(int) >= 0).all() - assert (df["Workgroup_Size_Z"].astype(int) <= max_z).all() - - -# Keep this in case we decide to revert wave_id information -def validate_wave_id(df, max_wave_id): - assert (df["Wave_Id"].astype(int) <= max_wave_id).all() - - -# Keep this in case we decide to revert wave_id information -def validate_chiplet(df, max_chiplet): - assert (df["Chiplet"].astype(int) <= max_chiplet).all() - - -def validate_instruction_decoding( - df, - inst_str, - exec_mask_uint64: np.uint64 = None, - source_code_lines_range: (int, int) = None, - all_source_lines_samples=False, -): - # Make a copy, so that we don't work (modify) a view. 
- df_inst = df[df["Instruction"].apply(lambda inst: inst.startswith(inst_str))].copy() - - assert not df_inst.empty - # assert the exec mask if requested - if exec_mask_uint64 is not None: - assert (df_inst["Exec_Mask"].astype(np.uint64) == exec_mask_uint64).all() - - # assert whether the samples source code lines belongs to the provided range - if source_code_lines_range is not None: - start_range, end_range = source_code_lines_range - # The instruction comment is isually in the following format: /path/to/source/file.cpp:line_num - df_inst["source_line_num"] = df_inst["Instruction_Comment"].apply( - lambda source_line: int(source_line.split(":")[-1]) - ) - assert (df_inst["source_line_num"] >= start_range).all() - assert (df_inst["source_line_num"] <= end_range).all() - # if requested, check if all lines from the range are sampled - if all_source_lines_samples: - assert len(df_inst["source_line_num"].unique()) == ( - end_range - start_range + 1 - ) - - -def validate_instruction_comment(df): - # Instruction comment must always be present, since the testing application - # is built with debug symbols. - assert ( - (df["Instruction_Comment"] != "") & (df["Instruction_Comment"] != "nullptr") - ).all() - - -def validate_instruction_correlation_id_relation(df): - # Samples with no decoded instructions originates from either - # blit kernels or self modifying code. The correlation id for this - # type of samples should alway be zero. - # Thus, Correlation_Id is 0 `iff`` instruction is not decoded. - - # The previous statement has two implications. - # Implication 1: If the instruction is not decoded, then correlation id is 0. - samples_no_instruction_df = df[ - (df["Instruction"] == "") | (df["Instruction"] == "nullptr") - ] - assert (samples_no_instruction_df["Correlation_Id"] == 0).all() - - # Implication 2: If the correlation id is 0, then the instruction is not decoded. 
- samples_cid_zero_df = df[df["Correlation_Id"] == 0] - assert ( - (samples_cid_zero_df["Instruction"] == "") - | (samples_cid_zero_df["Instruction"] == "nullptr") - ).all() - - assert len(samples_no_instruction_df) == len(samples_cid_zero_df) - - # Since we're not enabling any kind of API tracing, - # internal correlation id should match the dispatch id - assert all(df["Correlation_Id"] == df["Dispatch_Id"]) - - -def validate_exec_mask_based_on_correlation_id(df): - # The function assumes that each kernel launches 1024 blocks. - # Each block contains number of threads that matches correlation ID of the kernel. - # The exec mask of a sample should contain number of ones equal to - # the correlation ID of the kernel during which execution the sample was generated. - df["active_SIMD_threads"] = df["Exec_Mask"].apply( - lambda exec_mask: bin(exec_mask).count("1") - ) - assert (df["active_SIMD_threads"] == df["Correlation_Id"]).all() - - # TODO: Comment out the following code if it causes spurious fails. - # The more conservative constraint based on the experience follows. - # The exec mask of sampled instructions of the kernels respect the following pattern: - # cid -> exec - # 1 -> 0b1 - # 2 -> 0b11 - # 3 -> 0b111 - # ... 
- # 64 -> 0xffffffffffffffff - - df["Exec_Mask2"] = ( - df["Correlation_Id"].astype(int).apply(lambda x: int("0b" + (x * "1"), 2)) - ) - - # TODO: exec should be in hex and that will ease the comparison - assert (df["Exec_Mask"].astype(np.uint64) == df["Exec_Mask2"].astype(np.uint64)).all() - - -def exec_mask_manipulation_validate_csv(df, all_sampled=False): - assert not df.empty - - validate_instruction_comment(df) - validate_instruction_correlation_id_relation(df) - - # Validate samples with non-zero correlation IDs (and with decoded instructions) - samples_cid_non_zero_df = df[df["Correlation_Id"] != 0] - - # exactly 65 kernels and 65 correlation id - assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) >= 1).all() - assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) <= 65).all() - if all_sampled: - # all correlation IDs must be sampled - assert len(samples_cid_non_zero_df["Correlation_Id"].astype(int).unique()) == 65 - - first_64_kernels_df = samples_cid_non_zero_df[ - samples_cid_non_zero_df["Correlation_Id"] <= 64 - ] - - # Make a copy, so that we don't work (modify) a view. 
- validate_exec_mask_based_on_correlation_id(first_64_kernels_df.copy()) - - # validate the last kernel - kernel_65_df = df[df["Correlation_Id"] == 65] - - # assert that v_rcp instructions are properly decoded - # the v_rcp is executed by even SIMD threads - validate_instruction_decoding( - kernel_65_df, - "v_rcp_f64", - exec_mask_uint64=np.uint64(int("5555555555555555", 16)), - source_code_lines_range=(288, 387), - all_source_lines_samples=all_sampled, - ) - - # assert that v_rcp_f32 instructions are properly decoded - # the v_rcp_f32 is executed by odd SIMD threads - validate_instruction_decoding( - kernel_65_df, - "v_rcp_f32", - exec_mask_uint64=np.uint64(int("AAAAAAAAAAAAAAAA", 16)), - source_code_lines_range=(391, 490), - all_source_lines_samples=all_sampled, - ) - - def test_validate_pc_sampling_exec_mask_manipulation_csv( input_csv: pd.DataFrame, all_sampled: bool ): + from rocprofiler_sdk.pc_sampling.exec_mask_manipulation.csv import ( + exec_mask_manipulation_validate_csv, + ) + exec_mask_manipulation_validate_csv(input_csv, all_sampled=all_sampled) # ========================= Validating JSON output -def validate_json_exec_mask_manipulation(data_json, all_sampled=False): - # Although functional programming might look more elegant, - # I was trying to avoid multiple iteration over the list of samples. - # Thus, I decided to use procedural programming instead. - # Although, it would be more elegant to wrap some of the checks in dedicated functions, - # I noticed that it can introduce significant overhead, so I decided to inline those checks. 
- - # the function assume homogenous system - agents = data_json["agents"] - gpu_agents = list(filter(lambda agent: agent["type"] == 2, agents)) - # There should be at least one GPU agent - assert len(gpu_agents) > 0 - first_gpu_agent = gpu_agents[0] - num_xcc = first_gpu_agent["num_xcc"] - max_waves_per_simd = first_gpu_agent["max_waves_per_simd"] - simd_per_cu = first_gpu_agent["simd_per_cu"] - - instructions = data_json["strings"]["pc_sample_instructions"] - comments = data_json["strings"]["pc_sample_comments"] - - # execution mask where even SIMD lanes are active - # correspond to the v_rcp_f64 instructions of the last kernel - even_simds_active_exec_mask = np.uint64(int("5555555555555555", 16)) - # start and end source code lines of the v_rcp_f64 instructions of the last kernel - v_rcp_f64_start_line_num, v_rcp_f64_end_line_num = 288, 387 - # execution mask where even SIMD lanes are active - # correspond to the v_rcp_f64 instructions of the last kernel - odd_simds_active_exec_mask = np.uint64(int("AAAAAAAAAAAAAAAA", 16)) - # start and end source code lines of the v_rcp_f32 0 instructions of the last kernel - v_rcp_f32_start_line_num, v_rcp_f32_end_line_num = 391, 490 - - # sampled wave_ids of the last kernel - kernel65_sampled_wave_in_grp = set() - # sampled source lines of the last kernel matching v_rcp_f64 instructions - kernel65_v_rcp_64_sampled_source_line_set = set() - # sampled source lines of the last kernel matching v_rcp_f64 instructions - kernel65_v_rcp_f32_sampled_source_line_set = set() - # sampled correlation IDs - sampled_cids_set = set() - # pairs of sampled SIMD ids and waveslot IDs - sampled_simd_waveslots_pairs = set() - # sampled chiplets - sampled_chiplets = set() - # sample VMIDs - sampled_vmids = set() - - for sample in data_json["buffer_records"]["pc_sample_host_trap"]: - record = sample["record"] - cid = record["corr_id"]["internal"] - - # pull information from hw_id - hw_id = record["hw_id"] - sampled_chiplets.add(hw_id["chiplet"]) - 
sampled_simd_waveslots_pairs.add((hw_id["simd_id"], hw_id["wave_id"])) - sampled_vmids.add(hw_id["vm_id"]) - - # Checks specific for all samples - - # cids must be non-negative numbers - assert cid >= 0 - - inst_index = sample["inst_index"] - - # Since we're not enabling any kind of API tracing, the internal correlation id should - # be equal to the dispatch_id - assert cid == record["dispatch_id"] - - if cid == 0: - # Samples originates either from a blit kernel or self-modifying code. - # Thus, code object is uknown, as well as the instruction. - assert record["pc"]["code_object_id"] == 0 - assert inst_index == -1 - else: - # Update set of sampled cids - sampled_cids_set.add(cid) - - # All samples with non-zero correlation ID should pass the following checks - # code object is know, so as the instruction - assert record["pc"]["code_object_id"] != 0 - assert inst_index != -1 - - wgid = record["wrkgrp_id"] - # check corrdinates of the workgroup - assert wgid["x"] >= 0 and wgid["x"] <= 1023 - assert wgid["y"] == 0 - assert wgid["z"] == 0 - - wave_in_grp = record["wave_in_grp"] - exec_mask = record["exec_mask"] - - if cid < 65: - # checks specific for samples from first 64 kernels - assert wave_in_grp == 0 - # inline if possible - # validate_json_exec_mask_based_on_cid(sample.record) - - # The function assumes that each kernel launches 1024 blocks. - # Each block contains number of threads that matches correlation ID of the kernel. - # The exec mask of a sample should contain number of ones equal to - # the correlation ID of the kernel during which execution the sample was generated. - assert bin(exec_mask).count("1") == cid - - # TODO: Comment out the following code if it causes spurious fails. - # The more conservative constraint based on the experience follows. - # The exec mask of sampled instructions of the kernels respect the following pattern: - # cid -> exec - # 1 -> 0b1 - # 2 -> 0b11 - # 3 -> 0b111 - # ... 
- # 64 -> 0xffffffffffffffff - exec_mask_str = "0b" + "1" * cid - assert np.uint64(exec_mask) == np.uint64(int(exec_mask_str, 2)) - else: - # No more that 65 cids - assert cid == 65 - # Monitor wave_in_group being sampled - kernel65_sampled_wave_in_grp.add(wave_in_grp) - # chekcs specific for samples from the last kernel - assert wave_in_grp >= 0 and wave_in_grp <= 3 - - # validate instruction decoding - inst = instructions[inst_index] - comm = comments[inst_index] - # The instruction comment is isually in the following format: - # /path/to/source/file.cpp:line_num - line_num = int(comm.split(":")[-1]) - if inst.startswith("v_rcp_f64"): - # even SIMD lanes active - assert np.uint64(exec_mask) == even_simds_active_exec_mask - assert ( - line_num >= v_rcp_f64_start_line_num - and line_num <= v_rcp_f64_end_line_num - ) - kernel65_v_rcp_64_sampled_source_line_set.add(line_num) - elif inst.startswith("v_rcp_f32"): - # odd SIMD lanes active - assert np.uint64(exec_mask) == odd_simds_active_exec_mask - assert ( - line_num >= v_rcp_f32_start_line_num - and line_num <= v_rcp_f32_end_line_num - ) - kernel65_v_rcp_f32_sampled_source_line_set.add(line_num) - - if all_sampled: - # All cids that belongs to the range [1, 65] should be samples - assert len(sampled_cids_set) == 65 - - # all wave_ids that belongs to the range [0, 3] should be sampled for the last kernel - assert len(kernel65_sampled_wave_in_grp) == 4 - - # all source lines matches v_rcp_f64 instructions of the last kernel should be sampled - assert len(kernel65_v_rcp_64_sampled_source_line_set) == ( - v_rcp_f64_end_line_num - v_rcp_f64_start_line_num + 1 - ) - # all source lines matches v_rcp_f32 instructions of the last kernel should be sampled - assert len(kernel65_v_rcp_f32_sampled_source_line_set) == ( - v_rcp_f32_end_line_num - v_rcp_f32_start_line_num + 1 - ) - - # all chiplets must be sampled - assert len(sampled_chiplets) == num_xcc - # all (simd ID, waveslot ID) pairs must be samples - assert 
len(sampled_simd_waveslots_pairs) == simd_per_cu * max_waves_per_simd - - # assert chiplet index - assert all(map(lambda chiplet: 0 <= chiplet < num_xcc, sampled_chiplets)) - # assert (SIMD ID, waveslot ID) combinations - assert all( - map( - lambda simd_waveslot: (0 <= simd_waveslot[0] < simd_per_cu) - and (0 <= simd_waveslot[1] < max_waves_per_simd), - sampled_simd_waveslots_pairs, - ) - ) - - # Apparently, not all dispatches must belong to the same VMID, - # so I'm temporarily disabling the following check. - # # all samples should belong to the same VMID - # assert len(sampled_vmids) == 1 - - def test_validate_pc_sampling_exec_mask_manipulation_json( input_json, input_csv: pd.DataFrame, all_sampled: bool ): @@ -393,7 +52,13 @@ def test_validate_pc_sampling_exec_mask_manipulation_json( # The same amount of samples should be in both CSV and JSON files. assert len(input_csv) == len(data["buffer_records"]["pc_sample_host_trap"]) # # validating JSON output - validate_json_exec_mask_manipulation(data, all_sampled=all_sampled) + from rocprofiler_sdk.pc_sampling.exec_mask_manipulation.json import ( + validate_json_exec_mask_manipulation, + ) + + validate_json_exec_mask_manipulation( + data, pc_sampling_method="host_trap", all_sampled=all_sampled + ) if __name__ == "__main__": diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py index 98ea83aee4..732eafa9eb 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/transpose-multiple-agents/validate.py @@ -34,57 +34,13 @@ def test_multi_agent_support( input_kernel_trace_csv: pd.DataFrame, input_agent_info_csv: pd.DataFrame, ): - transpose_kernel_source_line_start = 137 - transpose_kernel_source_line_end = 145 - - mi2xx_mi3xx_agents_df = 
input_agent_info_csv[ - input_agent_info_csv["Name"].apply( - lambda name: name == "gfx90a" - or name.startswith("gfx94") - or name.startswith("gfx95") - ) - ] - - # Extract samples that originates from know code object it - samples_df = input_samples_csv[input_samples_csv["Dispatch_Id"] != 0].copy() - - # Determine the agent on which sample was generated - samples_df["Agent_Id"] = ( - samples_df["Dispatch_Id"] - .map( - input_kernel_trace_csv.set_index("Dispatch_Id")["Agent_Id"] - .str.split(" ") - .str[1] - ) - .astype(np.uint64) + from rocprofiler_sdk.pc_sampling.transpose_multiple_agents.csv import ( + validate_all_agents_are_sampled, ) - sampled_agents = samples_df["Agent_Id"].unique() - sampled_agents_num = len(sampled_agents) - # all agents must be sampled - assert sampled_agents_num == len(mi2xx_mi3xx_agents_df) - # separate samples per agents - grouped_samples_per_agent = samples_df.groupby("Agent_Id") - for agent_id, agent_samples_df in grouped_samples_per_agent: - sampled_dispatches = agent_samples_df["Dispatch_Id"].unique() - # at least 1 sampled dispatch per agent - assert len(sampled_dispatches) >= 1 - - # extract decoded samples that are mapped to the transpose.cpp file - transpose_samples_df = samples_df[ - samples_df["Instruction_Comment"].apply( - lambda comment: "transpose-all-agents.cpp" in comment - ) - ].copy() - # determine the line number for each sample - transpose_samples_df["Source_Line_Num"] = transpose_samples_df[ - "Instruction_Comment" - ].apply(lambda source_line: int(source_line.split(":")[-1])) - # assert that line belongs to a kernel range - assert ( - (transpose_samples_df["Source_Line_Num"] >= transpose_kernel_source_line_start) - & (transpose_samples_df["Source_Line_Num"] <= transpose_kernel_source_line_end) - ).all() + validate_all_agents_are_sampled( + input_samples_csv, input_kernel_trace_csv, input_agent_info_csv + ) if __name__ == "__main__": diff --git 
a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/CMakeLists.txt new file mode 100644 index 0000000000..f956eaa95e --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/CMakeLists.txt @@ -0,0 +1,6 @@ +# +# PC sampling tests +# + +add_subdirectory(exec-mask-manipulation) +add_subdirectory(transpose-multiple-agents) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/CMakeLists.txt new file mode 100644 index 0000000000..66ced26e33 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/CMakeLists.txt @@ -0,0 +1,153 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-pc-sampling + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + input.json input.yml) + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-cmd-execute + COMMAND + $ --pc-sampling-unit cycles + --pc-sampling-method stochastic --pc-sampling-interval 1048576 -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input -o out --output-format csv json + -- $) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(pc-sampling-env-stochastic-exec-mask-manipulation "${PRELOAD_ENV}") + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-cmd-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling;stochastic" + ENVIRONMENT + "${pc-sampling-env-stochastic-exec-mask-manipulation}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC 
sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-json-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.json -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input -- + $) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-json-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling;stochastic" + ENVIRONMENT + "${pc-sampling-env-stochastic-exec-mask-manipulation}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-yaml-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.yml -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input -- + $) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-yaml-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling;stochastic" + ENVIRONMENT + "${pc-sampling-env-stochastic-exec-mask-manipulation}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +# ========================= Validation tests + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-cmd-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_pc_sampling_stochastic.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-cmd-validate + PROPERTIES + TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling;stochastic" + DEPENDS + "rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-cmd-execute" + FAIL_REGULAR_EXPRESSION + 
"${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-json-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_pc_sampling_stochastic.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-json-validate + PROPERTIES + TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling;stochastic" + DEPENDS + "rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-json-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-yaml-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_pc_sampling_stochastic.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-yaml-validate + PROPERTIES + TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling;stochastic" + DEPENDS + "rocprofv3-test-pc-sampling-stochastic-exec-mask-manipulation-input-yaml-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/conftest.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/conftest.py new file mode 100644 index 0000000000..3fa04d8f2e --- /dev/null +++ 
b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/conftest.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +import json +import os +import pytest +import pandas as pd + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list + + +def pytest_addoption(parser): + parser.addoption( + "--input-csv", + action="store", + help="Path to CSV file.", + ) + + parser.addoption( + "--input-json", + action="store", + help="Path to JSON file.", + ) + + parser.addoption( + "--all-sampled", + action="store", + help="All SW and HW units must be sampled.", + ) + + +@pytest.fixture +def input_csv(request): + filename = request.config.getoption("--input-csv") + if not os.path.isfile(filename): + # The CSV file is not generated, because the dependency test + # responsible for generating this file was skipped or failed. + # Thus emit the message to skip this test as well. + print("PC sampling unavailable") + else: + with open(filename, "r") as inp: + return pd.read_csv( + inp, + na_filter=False, # parse empty fields as "" + keep_default_na=False, # parse empty fields as "" + dtype={ + "Exec_Mask": "uint64", + "Instruction": str, + "Instruction_Comment": str, + "Wave_Issued_Instruction": bool, + "Instruction_Type": str, + "Stall_Reason": str, + }, + ) + + +@pytest.fixture +def input_json(request): + filename = request.config.getoption("--input-json") + with open(filename, "r") as inp: + # Significant overhead of 5-6secs observed when feeding + # data into the dotdict.
+ # Using plain python dict instead + return collapse_dict_list(json.load(inp)) + + +@pytest.fixture +def all_sampled(request): + _all_sampled_str = request.config.getoption("--all-sampled") + return _all_sampled_str == "True" diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.json b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.json new file mode 100644 index 0000000000..ac3f1e7100 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.json @@ -0,0 +1,14 @@ +{ + "jobs": [ + { + "pc_sampling_unit": "cycles", + "pc_sampling_method": "stochastic", + "pc_sampling_interval": 1048576, + "output_file": "out", + "output_format": [ + "csv", + "json" + ] + } + ] +} diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.yml b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.yml new file mode 100644 index 0000000000..a12fc500c0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/input.yml @@ -0,0 +1,8 @@ +jobs: + - pc_sampling_unit: "cycles" + pc_sampling_method: "stochastic" + pc_sampling_interval: 1048576 + output_file: "out" + output_format: + - "csv" + - "json" diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/pytest.ini b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git 
a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/validate.py new file mode 100644 index 0000000000..8b257143a9 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/exec-mask-manipulation/validate.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import numpy as np +import pandas as pd + + +# =========================== Validating fields common for both host-trap and stochastic CSV output + + +def test_validate_pc_sampling_exec_mask_manipulation_csv( + input_csv: pd.DataFrame, all_sampled: bool +): + from rocprofiler_sdk.pc_sampling.exec_mask_manipulation.csv import ( + exec_mask_manipulation_validate_csv, + ) + + exec_mask_manipulation_validate_csv(input_csv, all_sampled=all_sampled) + + +# # ========================= Validating fields common for both host-trap and stochastic JSON output + + +def test_validate_pc_sampling_exec_mask_manipulation_json( + input_json, input_csv: pd.DataFrame, all_sampled: bool +): + data = input_json["rocprofiler-sdk-tool"] + # The same amount of samples should be in both CSV and JSON files. 
+ assert len(input_csv) == len(data["buffer_records"]["pc_sample_stochastic"]) + # # validating JSON output + from rocprofiler_sdk.pc_sampling.exec_mask_manipulation.json import ( + validate_json_exec_mask_manipulation, + ) + + validate_json_exec_mask_manipulation( + data, pc_sampling_method="stochastic", all_sampled=all_sampled + ) + + +# ======================== Validating fields specific for stochastic sampling + + +def test_validate_pc_sampling_stochastic_specific_csv(input_csv: pd.DataFrame): + from rocprofiler_sdk.pc_sampling.stochastic.csv.gfx9 import ( + validate_stochastic_samples_csv, + ) + + validate_stochastic_samples_csv(input_csv) + + +def test_validate_pc_sampling_stochastic_specific_json(input_json): + from rocprofiler_sdk.pc_sampling.stochastic.json.gfx9 import ( + validate_stochastic_samples_json, + ) + + validate_stochastic_samples_json(input_json["rocprofiler-sdk-tool"]) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/CMakeLists.txt new file mode 100644 index 0000000000..d34bc74852 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/CMakeLists.txt @@ -0,0 +1,171 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-pc-sampling + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + input.json input.yml) + +# To ensure we sample all agents, use 16 threads each assigned to one agent in a round +# robin manner. To keep the job per device reasonable, each thread offloads 100 instances +# of a transpose kernel. 
+set(TRANSPOSE_NUM_THREADS 16) +set(TRANSPOSE_NUM_ITERATIONS 100) + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-cmd-execute + COMMAND + $ --kernel-trace --pc-sampling-unit + cycles --pc-sampling-method stochastic --pc-sampling-interval 1048576 -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input -o out --output-format csv json + -- $ ${TRANSPOSE_NUM_THREADS} ${TRANSPOSE_NUM_ITERATIONS}) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(pc-sampling-env-stochastic-transpose-multiple-agents "${PRELOAD_ENV}") + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-cmd-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling;stochastic" + ENVIRONMENT + "${pc-sampling-env-stochastic-transpose-multiple-agents}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-json-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.json -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input -- $ + ${TRANSPOSE_NUM_THREADS} ${TRANSPOSE_NUM_ITERATIONS}) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-json-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling;stochastic" + ENVIRONMENT + "${pc-sampling-env-stochastic-transpose-multiple-agents}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-yaml-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.yml -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input -- $ + ${TRANSPOSE_NUM_THREADS} ${TRANSPOSE_NUM_ITERATIONS}) + +set_tests_properties( + 
rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-yaml-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling;stochastic" + ENVIRONMENT + "${pc-sampling-env-stochastic-transpose-multiple-agents}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +# ========================= Validation tests + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-cmd-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k test_ + --input-samples-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_pc_sampling_stochastic.csv + --input-samples-json + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_results.json + --input-kernel-trace-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_kernel_trace.csv + --input-agent-info-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_agent_info.csv) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-cmd-validate + PROPERTIES + TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling;stochastic" + DEPENDS + "rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-cmd-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-json-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k test_ + --input-samples-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_pc_sampling_stochastic.csv + --input-samples-json + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_results.json + --input-kernel-trace-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_kernel_trace.csv + --input-agent-info-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_agent_info.csv) + 
+set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-json-validate + PROPERTIES + TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling;stochastic" + DEPENDS + "rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-json-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-yaml-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k test_ + --input-samples-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_pc_sampling_stochastic.csv + --input-samples-json + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_results.json + --input-kernel-trace-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_kernel_trace.csv + --input-agent-info-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_agent_info.csv) + +set_tests_properties( + rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-yaml-validate + PROPERTIES + TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling;stochastic" + DEPENDS + "rocprofv3-test-pc-sampling-stochastic-transpose-multiple-agents-input-yaml-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/conftest.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/conftest.py new file mode 100644 index 0000000000..73ea92a918 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/conftest.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import json +import os +import pytest +import pandas as pd + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import
collapse_dict_list + + +def pytest_addoption(parser): + parser.addoption( + "--input-samples-csv", + action="store", + help="Path to CSV file containing PC samples.", + ) + + parser.addoption( + "--input-kernel-trace-csv", + action="store", + help="Path to CSV file containing kernel trace.", + ) + + parser.addoption( + "--input-agent-info-csv", + action="store", + help="Path to CSV file containing agents information.", + ) + + parser.addoption( + "--input-samples-json", + action="store", + help="Path to JSON file containing PC samples.", + ) + + +@pytest.fixture +def input_samples_csv(request): + filename = request.config.getoption("--input-samples-csv") + if not os.path.isfile(filename): + # The CSV file is not generated, because the dependency test + # responsible for generating this file was skipped or failed. + # Thus emit the message to skip this test as well. + print("PC sampling unavailable") + else: + with open(filename, "r") as inp: + return pd.read_csv( + inp, + na_filter=False, # parse empty fields as "" + keep_default_na=False, # parse empty fields as "" + dtype={ + "Exec_Mask": "uint64", + "Instruction": str, + "Instruction_Comment": str, + "Wave_Issued_Instruction": bool, + "Instruction_Type": str, + "Stall_Reason": str, + }, + ) + + +@pytest.fixture +def input_kernel_trace_csv(request): + filename = request.config.getoption("--input-kernel-trace-csv") + with open(filename, "r") as inp: + return pd.read_csv(inp) + + +@pytest.fixture +def input_agent_info_csv(request): + filename = request.config.getoption("--input-agent-info-csv") + with open(filename, "r") as inp: + return pd.read_csv(inp) + + +@pytest.fixture +def input_samples_json(request): + filename = request.config.getoption("--input-samples-json") + with open(filename, "r") as inp: + # Significant overhead of 5-6secs observed when feeding + # data into the dotdict.
+ # Using plain python dict instead + return collapse_dict_list(json.load(inp)) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.json b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.json new file mode 100644 index 0000000000..2cacdcfac6 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.json @@ -0,0 +1,15 @@ +{ + "jobs": [ + { + "kernel_trace": true, + "pc_sampling_unit": "cycles", + "pc_sampling_method": "stochastic", + "pc_sampling_interval": 1048576, + "output_file": "out", + "output_format": [ + "csv", + "json" + ] + } + ] +} diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.yml b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.yml new file mode 100644 index 0000000000..1690e3594a --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/input.yml @@ -0,0 +1,9 @@ +jobs: + - kernel_trace: true + pc_sampling_unit: "cycles" + pc_sampling_method: "stochastic" + pc_sampling_interval: 1048576 + output_file: "out" + output_format: + - "csv" + - "json" diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/pytest.ini b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/validate.py 
b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/validate.py new file mode 100644 index 0000000000..e409e99293 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/stochastic/transpose-multiple-agents/validate.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +import itertools +import sys +import pytest +import numpy as np +import pandas as pd + + +# ===================== validation common for both host-trap and stochastic sampling +def test_multi_agent_support( + input_samples_csv: pd.DataFrame, + input_kernel_trace_csv: pd.DataFrame, + input_agent_info_csv: pd.DataFrame, +): + from rocprofiler_sdk.pc_sampling.transpose_multiple_agents.csv import ( + validate_all_agents_are_sampled, + ) + + validate_all_agents_are_sampled( + input_samples_csv, input_kernel_trace_csv, input_agent_info_csv + ) + + +# =================== validation specific to stochastic sampling + + +def test_validate_pc_sampling_stochastic_specific_csv(input_samples_csv: pd.DataFrame): + from rocprofiler_sdk.pc_sampling.stochastic.csv.gfx9 import ( + validate_stochastic_samples_csv, + ) + + validate_stochastic_samples_csv(input_samples_csv) + + +def test_validate_pc_sampling_stochastic_specific_json(input_samples_json): + from rocprofiler_sdk.pc_sampling.stochastic.json.gfx9 import ( + validate_stochastic_samples_json, + ) + + validate_stochastic_samples_json(input_samples_json["rocprofiler-sdk-tool"]) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code)