From 9cfbfb506031b52b15b53d9eb50ce69d57e366eb Mon Sep 17 00:00:00 2001 From: "Nagaraj, Sriraksha" Date: Wed, 4 Dec 2024 18:32:48 -0600 Subject: [PATCH] rocprofv3: PC Sampling Support (#14) * Adding tool pc sampling support Fixing merge issue tool support on SDKupdates link amd-comgr Sanitizer failure fix fix format Addressing review comments misc fix Adding dispatch id to the CSV output AddingCHANGELOG [ROCProfV3][PC Sampling] Initial ROCProfV3 PC sampling tests for JSON and CSV formats (#17) ROCProfV3 initial tests for JSON and CSV output. Simple kernels that simplify the verification of samples to instruction decoding has been introduced. removing option to enable pc sampling explicitly Adding documentation no pc-sampling option in tests anymore Addressing review comments Updating docs an option for choosing whether all units must be sampled try ignoring PC sampling tests (#36) * run pc-sampling tests on MI2xx runners * use v_fmac_f32 instead of s_nop 0 in tests * fixing docs [ROCm/rocprofiler-sdk commit: 50b185b9acd789aec919e3ed326c72fbf1e2c2b1] --- projects/rocprofiler-sdk/CHANGELOG.md | 1 + .../rocprofiler-sdk/source/bin/rocprofv3.py | 37 ++ .../source/docs/how-to/using-rocprofv3.rst | 54 ++ .../source/docs/rocprofv3-schema.json | 132 ++++- .../source/docs/rocprofv3_input_schema.json | 14 +- .../rocprofiler-sdk/cxx/serialization.hpp | 38 ++ .../source/lib/output/buffered_output.hpp | 4 + .../rocprofiler-sdk/source/lib/output/csv.hpp | 23 +- .../source/lib/output/domain_type.cpp | 4 + .../source/lib/output/domain_type.hpp | 1 + .../source/lib/output/generateCSV.cpp | 59 ++ .../source/lib/output/generateCSV.hpp | 5 + .../source/lib/output/generateJSON.cpp | 10 +- .../source/lib/output/generateJSON.hpp | 3 +- .../source/lib/output/generateStats.cpp | 8 + .../source/lib/output/generateStats.hpp | 4 + .../source/lib/output/metadata.cpp | 92 ++- .../source/lib/output/metadata.hpp | 48 +- .../source/lib/output/pc_sample_transform.hpp | 83 +++ .../lib/rocprofiler-sdk-tool/CMakeLists.txt | 4 +- .../lib/rocprofiler-sdk-tool/config.cpp | 16 + .../lib/rocprofiler-sdk-tool/config.hpp | 45 +- .../source/lib/rocprofiler-sdk-tool/tool.cpp | 128 ++++- .../rocprofiler-sdk/tests/bin/CMakeLists.txt | 1 + .../tests/bin/pc-sampling/CMakeLists.txt | 11 + .../exec-mask-manipulation/CMakeLists.txt | 43 ++ .../exec_mask_manipulation.cpp | 537 ++++++++++++++++++ .../tests/rocprofv3/CMakeLists.txt | 1 + .../rocprofv3/pc-sampling/CMakeLists.txt | 5 + .../pc-sampling/host-trap/CMakeLists.txt | 5 + .../exec-mask-manipulation/CMakeLists.txt | 150 +++++ .../exec-mask-manipulation/conftest.py | 67 +++ .../exec-mask-manipulation/input.json | 14 + .../exec-mask-manipulation/input.yml | 4 + .../exec-mask-manipulation/pytest.ini | 5 + .../exec-mask-manipulation/validate.py | 376 ++++++++++++ 36 files changed, 1977 insertions(+), 55 deletions(-) mode change 100755 => 100644 projects/rocprofiler-sdk/source/lib/output/domain_type.cpp mode change 100755 => 100644 projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp create mode 100644 projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp mode change 100755 => 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp create mode 100644 projects/rocprofiler-sdk/tests/bin/pc-sampling/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/exec_mask_manipulation.cpp create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/conftest.py create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.json create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.yml create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/pytest.ini create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index 4145140d01..ca9c4e5bd2 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -109,6 +109,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec - Check to force tools to initialize context id with zero - Support to specify hardware counters for collection using rocprofv3 as `rocprofv3 --pmc [COUNTER [COUNTER ...]]` - Memory Allocation Tracing +- PC sampling tool support with CSV and JSON output formats ### Changed diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index 8209b04eec..69fa85587e 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -4,6 +4,7 @@ import os import sys import argparse import subprocess +import numpy class dotdict(dict): @@ -167,6 +168,30 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies, scratch, and allocations), and Kernel dispatches.", ) + pc_sampling_options = parser.add_argument_group("PC sampling options") + + pc_sampling_options.add_argument( + "--pc-sampling-unit", + help="", + default=None, + type=str.lower, + choices=("instructions", "cycles", "time"), + ) + + pc_sampling_options.add_argument( + "--pc-sampling-method", + help="", + default=None, + type=str.lower, + choices=("stochastic", "host_trap"), + ) + + pc_sampling_options.add_argument( + "--pc-sampling-interval", + help="", + default=None, + type=numpy.uint64, + ) basic_tracing_options = parser.add_argument_group("Basic tracing options") # Add the arguments @@ -904,6 +929,18 @@ def run(app_args, args, **kwargs): if args.log_level in ("info", "trace", "env"): log_config(app_env) + if args.pc_sampling_unit or args.pc_sampling_method or args.pc_sampling_method: + if not ( + args.pc_sampling_unit and args.pc_sampling_method and args.pc_sampling_method + ): + fatal_error("All three PC sampling configurations need to be set") + + update_env("ROCPROFILER_PC_SAMPLING_BETA_ENABLED", "ON") + update_env("ROCPROF_PC_SAMPLING_UNIT", args.pc_sampling_unit) + update_env("ROCPROF_PC_SAMPLING_METHOD", args.pc_sampling_method) + update_env("ROCPROF_PC_SAMPLING_INTERVAL", args.pc_sampling_interval) + update_env("ROCPROF_ENABLE_PC_SAMPLING", args.pc_sampling) + if use_execv: # does not return os.execvpe(app_args[0], app_args, env=app_env) diff --git a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst index 70a945fdc4..5f1adf7ae4 100644 --- a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst +++ b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst @@ -162,6 +162,18 @@ Here is the sample of commonly used ``rocprofv3`` command-line options. Some opt - Perfetto shared memory size hint in KB. default: 64 KB - Extension + * - ``--pc-sampling-method`` + - Type of PC Sampling, currently only host trap method is supported + - PC Sampling Configurations + + * - ``--pc-sampling-unit`` + - The unit appropriate to the PC sampling type/method, currently only time unit is supported + - PC Sampling Configurations + + * - ``--pc-sampling-interval`` + - Frequency at which PC samples are generated + - PC Sampling Configurations + To see exhaustive list of ``rocprofv3`` options, run: .. code-block:: bash @@ -675,6 +687,9 @@ Properties trace. - **``preload``** *(array)*: Libraries to prepend to LD_PRELOAD (usually for sanitizers). + - **``pc_sampling_unit``** *(string)*: pc sampling unit. + - **``pc_sampling_method``** *(string)*: pc sampling method. + - **``pc_sampling_interval``** *(integer)*: pc sampling interval. .. code-block:: shell @@ -1039,6 +1054,14 @@ Properties - **`id`** *(integer, required)*: Dimension ID. - **`instance_size`** *(integer, required)*: Size of the instance. - **`name`** *(string, required)*: Name of the dimension. + - **``pc_sample_instructions``** *(array)*: Array of decoded + instructions matching sampled PCs from pc_sample_host_trap + section. + - **``pc_sample_comments``** *(array)*: Comments matching + assembly instructions from pc_sample_instructions array. If + debug symbols are available, comments provide instructions + to source-line mapping. Otherwise, a comment is an empty + string. - **`code_objects`** *(array, required)*: Code object records. - **Items** *(object)* - **`size`** *(integer, required)*: Size of the code object. @@ -1103,6 +1126,37 @@ Properties - **`arch_vgpr_count`** *(integer, required)*: Count of VGPRs. - **`sgpr_count`** *(integer, required)*: Count of SGPRs. - **`lds_block_size_v`** *(integer, required)*: Size of LDS block. + - **``pc_sample_host_trap``** *(array)*: Host Trap PC Sampling records. + - **Items** *(object)* + - **``hw_id``** *(object)*: Describes hardware part on which sampled wave was running. + - **``chiplet``** *(integer)*: Chiplet index. + - **``wave_id``** *(integer)*: Wave slot index. + - **``simd_id``** *(integer)*: SIMD index. + - **``pipe_id``** *(integer)*: Pipe index. + - **``cu_or_wgp_id``** *(integer)*: Index of compute unit or workgroup processer. + - **``shader_array_id``** *(integer)*: Shader array index. + - **``shader_engine_id``** *(integer)*: Shader engine + index. + - **``workgroup_id``** *(integer)*: Workgroup position in the 3D. + - **``vm_id``** *(integer)*: Virtual memory ID. + - **``queue_id``** *(integer)*: Queue id. + - **``microengine_id``** *(integer)*: ACE + (microengine) index. + - **``pc``** *(object)*: Encapsulates information about + sampled PC. + - **``code_object_id``** *(integer)*: Code object id. + - **``code_object_offset``** *(integer)*: Offset within the object if the latter is known. Otherwise, virtual address of the PC. + - **``exec_mask``** *(integer)*: Execution mask indicating active SIMD lanes of sampled wave. + - **``timestamp``** *(integer)*: Timestamp. + - **``dispatch_id``** *(integer)*: Dispatch id. + - **``correlation_id``** *(object)*: Correlation ID information. + - **``internal``** *(integer)*: Internal correlation ID. + - **``external``** *(integer)*: External correlation ID. + - **``rocprofiler_dim3_t``** *(object)*: Position of the workgroup in 3D grid. + - **``x``** *(integer)*: Dimension x. + - **``y``** *(integer)*: Dimension y. + - **``z``** *(integer)*: Dimension z. + - **``wave_in_group``** *(integer)*: Wave position within the workgroup (0-31). - **`buffer_records`** *(object, required)*: Buffer record details. - **`kernel_dispatch`** *(array)*: Kernel dispatch records. - **Items** *(object)* diff --git a/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json b/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json index f9438dc752..4cbd97aa7b 100644 --- a/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json +++ b/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json @@ -559,6 +559,14 @@ "required": [ "dimension_ids" ] + }, + "pc_sample_instructions": { + "type": "array", + "description": "Array of decoded instructions matching sampled PCs from pc_sample_host_trap section." + }, + "pc_sample_comments": { + "type": "array", + "description": "Comments matching assembly instructions from pc_sample_instructions array. If debug symbols are available, comments provide instructions to source-line mapping. Otherwise, a comment is an empty string." } } }, @@ -925,7 +933,129 @@ "lds_block_size_v" ] } - } + }, + "pc_sample_host_trap": { + "type": "array", + "description": "Host Trap PC Sampling records.", + "items": { + "type": "object", + "properties": { + "hw_id": { + "type": "object", + "description" : "Describes hardware part on which sampled wave was running.", + "properties": { + "chiplet":{ + "type": "integer", + "description": "Chiplet index." + }, + "wave_id ":{ + "type": "integer", + "description": "Wave slot index." + }, + "simd_id":{ + "type": "integer", + "description": "SIMD index." + }, + "pipe_id ":{ + "type": "integer", + "description": "Pipe index." + }, + "cu_or_wgp_id":{ + "type": "integer", + "description": "Index of compute unit or workgroup processer." + }, + "shader_array_id":{ + "type": "integer", + "description": "Shader array index." + }, + "shader_engine_id":{ + "type": "integer", + "description": "Shader engine index." + }, + "workgroup_id":{ + "type": "integer", + "description": "Workgroup position in the 3D." + }, + "vm_id":{ + "type": "integer", + "description": "Virtual memory ID." + }, + "queue_id":{ + "type": "integer", + "description": "Queue id." + }, + "microengine_id":{ + "type": "integer", + "description": "ACE (microengine) index." + } + } + }, + "pc": { + "type": "object", + "description": "Encapsulates information about sampled PC.", + "properties": { + "code_object_id":{ + "type": "integer", + "description": "Code object id" + }, + "code_object_offset":{ + "type": "integer", + "description": "Offset within the object if the latter is known. Otherwise, virtual address of the PC." + } + } + }, + "exec_mask":{ + "type": "integer", + "description": "Execution mask indicating active SIMD lanes of sampled wave." + }, + "timestamp":{ + "type": "integer", + "description": "Timestamp." + }, + "dispatch_id":{ + "type": "integer", + "description": "Dispatch id." + }, + "correlation_id": { + "type": "object", + "description": "Correlation ID information.", + "properties": { + "internal": { + "type": "integer", + "description": "Internal correlation ID." + }, + "external": { + "type": "integer", + "description": "External correlation ID." + } + } + }, + "rocprofiler_dim3_t": { + "type": "object", + "description": " Position of the workgroup in 3D grid.", + "properties": { + "x": { + "type": "integer", + "description": "Dimension x." + }, + "y": { + "type": "integer", + "description": "Dimension y." + }, + "z": { + "type": "integer", + "description": "Dimension z." + } + } + }, + "wave_in_group": { + "type": "integer", + "description": "Wave position within the workgroup (0-31)." + } + } + } + } + } }, "buffer_records": { diff --git a/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json b/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json index 08f999a639..63eb21ef79 100644 --- a/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json +++ b/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json @@ -144,7 +144,19 @@ "preload":{ "type": "array", "description": "Libraries to prepend to LD_PRELOAD (usually for sanitizers)" - } + }, + "pc_sampling_unit": { + "type": "string", + "description": "pc sampling unit" + }, + "pc_sampling_method": { + "type": "string", + "description": "pc sampling method" + }, + "pc_sampling_interval": { + "type": "integer", + "description": "pc sampling interval" + } } } } diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp index ff62fc7d9d..5c1e053b8c 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp @@ -735,6 +735,44 @@ save(ArchiveT& ar, rocprofiler_agent_cache_t data) ROCP_SDK_SAVE_DATA_FIELD(latency); ROCP_SDK_SAVE_DATA_FIELD(type); } +template +void +save(ArchiveT& ar, rocprofiler_pc_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(code_object_id); + ROCP_SDK_SAVE_DATA_FIELD(code_object_offset); +} + +template +void +save(ArchiveT& ar, rocprofiler_pc_sampling_hw_id_v0_t data) +{ + ROCP_SDK_SAVE_DATA_BITFIELD("chiplet", chiplet); + ROCP_SDK_SAVE_DATA_BITFIELD("wave_id", wave_id); + ROCP_SDK_SAVE_DATA_BITFIELD("simd_id", simd_id); + ROCP_SDK_SAVE_DATA_BITFIELD("pipe_id", pipe_id); + ROCP_SDK_SAVE_DATA_BITFIELD("cu_or_wgp_id", cu_or_wgp_id); + ROCP_SDK_SAVE_DATA_BITFIELD("shader_array_id", shader_array_id); + ROCP_SDK_SAVE_DATA_BITFIELD("shader_engine_id", shader_engine_id); + ROCP_SDK_SAVE_DATA_BITFIELD("workgroup_id ", workgroup_id); + ROCP_SDK_SAVE_DATA_BITFIELD("vm_id", vm_id); + ROCP_SDK_SAVE_DATA_BITFIELD("queue_id", queue_id); + ROCP_SDK_SAVE_DATA_BITFIELD("microengine_id", microengine_id); +} + +template +void +save(ArchiveT& ar, rocprofiler_pc_sampling_record_host_trap_v0_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(hw_id); + ROCP_SDK_SAVE_DATA_FIELD(pc); + ROCP_SDK_SAVE_DATA_FIELD(exec_mask); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(dispatch_id); + ROCP_SDK_SAVE_DATA_VALUE("corr_id", correlation_id); + ROCP_SDK_SAVE_DATA_VALUE("wrkgrp_id", workgroup_id); + ROCP_SDK_SAVE_DATA_BITFIELD("wave_in_grp", wave_in_group); +} template void diff --git a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp index 992f40605b..27f943c774 100644 --- a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp @@ -24,6 +24,7 @@ #include "counter_info.hpp" #include "generator.hpp" +#include "pc_sample_transform.hpp" #include "statistics.hpp" #include "tmp_file_buffer.hpp" @@ -159,5 +160,8 @@ using memory_allocation_buffered_output_t = using counter_records_buffered_output_t = ::rocprofiler::tool::buffered_output; +using pc_sampling_host_trap_buffered_output_t = + buffered_output; } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/csv.hpp b/projects/rocprofiler-sdk/source/lib/output/csv.hpp index f66a2561f6..d3a7f7dd4b 100644 --- a/projects/rocprofiler-sdk/source/lib/output/csv.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/csv.hpp @@ -99,17 +99,18 @@ struct csv_encoder } }; -using api_csv_encoder = csv_encoder<7>; -using agent_info_csv_encoder = csv_encoder<53>; -using kernel_trace_csv_encoder = csv_encoder<18>; -using counter_collection_csv_encoder = csv_encoder<18>; -using memory_copy_csv_encoder = csv_encoder<7>; -using memory_allocation_csv_encoder = csv_encoder<8>; -using marker_csv_encoder = csv_encoder<7>; -using list_basic_metrics_csv_encoder = csv_encoder<5>; -using list_derived_metrics_csv_encoder = csv_encoder<5>; -using scratch_memory_encoder = csv_encoder<8>; -using stats_csv_encoder = csv_encoder<8>; +using api_csv_encoder = csv_encoder<7>; +using agent_info_csv_encoder = csv_encoder<53>; +using kernel_trace_csv_encoder = csv_encoder<18>; +using counter_collection_csv_encoder = csv_encoder<18>; +using memory_copy_csv_encoder = csv_encoder<7>; +using memory_allocation_csv_encoder = csv_encoder<8>; +using marker_csv_encoder = csv_encoder<7>; +using list_basic_metrics_csv_encoder = csv_encoder<5>; +using list_derived_metrics_csv_encoder = csv_encoder<5>; +using scratch_memory_encoder = csv_encoder<8>; +using stats_csv_encoder = csv_encoder<8>; +using pc_sampling_host_trap_csv_encoder = csv_encoder<6>; } // namespace csv } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp old mode 100755 new mode 100644 index 8d281a86f7..f476189150 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp @@ -57,6 +57,10 @@ DEFINE_BUFFER_TYPE_NAME(MEMORY_ALLOCATION, "memory_allocation", "memory_allocation_stats") DEFINE_BUFFER_TYPE_NAME(COUNTER_VALUES, "COUNTER_VALUES", "counter_values", "no_filename") +DEFINE_BUFFER_TYPE_NAME(PC_SAMPLING_HOST_TRAP, + "PC_SAMPLING_HOST_TRAP", + "pc_sampling_host_trap", + "pc_sampling_host_trap_stats") #undef DEFINE_BUFFER_TYPE_NAME diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp index 8fc8ced370..28b41b376a 100644 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp @@ -36,6 +36,7 @@ enum class domain_type RCCL, MEMORY_ALLOCATION, COUNTER_VALUES, + PC_SAMPLING_HOST_TRAP, LAST, }; diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp old mode 100755 new mode 100644 index a95965d338..f1ea6c64a2 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp @@ -717,6 +717,65 @@ generate_csv(const output_config& cfg, } } +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats) +{ + if(data.empty()) return; + + if(cfg.stats && stats) + write_stats(get_stats_output_file(cfg, domain_type::PC_SAMPLING_HOST_TRAP), stats.entries); + + auto ofs = tool::csv_output_file{cfg, + domain_type::PC_SAMPLING_HOST_TRAP, + tool::csv::pc_sampling_host_trap_csv_encoder{}, + {"Sample_Timestamp", + "Exec_Mask", + "Dispatch_Id", + "Instruction", + "Instruction_Comment", + "Correlation_Id"}}; + for(auto ditr : data) + { + for(const auto& record : data.get(ditr)) + { + if(record.inst_index == -1) + { + auto row_ss = std::stringstream{}; + std::string inst_comment = + "Unrecognized code object id, physical virtual address of PC:" + + std::to_string(record.pc_sample_record.pc.code_object_offset); + rocprofiler::tool::csv::pc_sampling_host_trap_csv_encoder::write_row( + row_ss, + record.pc_sample_record.timestamp, + record.pc_sample_record.exec_mask, + record.pc_sample_record.dispatch_id, + "", + inst_comment, + record.pc_sample_record.correlation_id.internal); + + ofs << row_ss.str(); + } + else + { + auto row_ss = std::stringstream{}; + rocprofiler::tool::csv::pc_sampling_host_trap_csv_encoder::write_row( + row_ss, + record.pc_sample_record.timestamp, + record.pc_sample_record.exec_mask, + record.pc_sample_record.dispatch_id, + tool_metadata.get_instruction(record.inst_index), + tool_metadata.get_comment(record.inst_index), + record.pc_sample_record.correlation_id.internal); + + ofs << row_ss.str(); + } + } + } +} + void generate_csv(const output_config& cfg, const metadata& /*tool_metadata*/, diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp index c38f2d42f7..9c03a6821a 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp @@ -92,6 +92,11 @@ generate_csv(const output_config& const metadata& tool_metadata, const generator& data, const stats_entry_t& stats); +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats); void generate_csv(const output_config& cfg, diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp index e1dbac5b96..e5e20c5172 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp @@ -124,7 +124,13 @@ write_json(json_output& json_ar, json_ar.startNode(); json_ar(cereal::make_nvp("callback_records", callback_name_info)); json_ar(cereal::make_nvp("buffer_records", buffer_name_info)); + json_ar( + cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions())); + json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments())); json_ar(cereal::make_nvp("marker_api", marker_msg_data)); + json_ar( + cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions())); + json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments())); { auto _extern_corr_id_strings = std::map{}; @@ -178,7 +184,8 @@ write_json(json_output& json_ar, generator marker_api_gen, generator scratch_memory_gen, generator rccl_api_gen, - generator memory_allocation_gen) + generator memory_allocation_gen, + generator pc_sampling_gen) { // summary @@ -219,6 +226,7 @@ write_json(json_output& json_ar, json_ar(cereal::make_nvp("memory_copy", memory_copy_gen)); json_ar(cereal::make_nvp("memory_allocation", memory_allocation_gen)); json_ar(cereal::make_nvp("scratch_memory", scratch_memory_gen)); + json_ar(cereal::make_nvp("pc_sample_host_trap", pc_sampling_gen)); json_ar.finishNode(); } } diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp index 9effe0b7f8..93baa8c3fc 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp @@ -93,6 +93,7 @@ write_json(json_output& json generator marker_api_gen, generator scratch_memory_gen, generator rccl_api_gen, - generator memory_allocation_gen); + generator memory_allocation_gen, + generator pc_sampling_gen); } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp index 50a7d338e6..a8ab73cb3f 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp @@ -411,5 +411,13 @@ generate_stats(const output_config& cfg, if(cfg.stats_summary) generate_stats(cfg, _os, "SUMMARY", data_v, _indent); } + +stats_entry_t +generate_stats(const output_config& /* cfg*/, + const metadata& /*tool_metadata*/, + const generator& /*data*/) +{ + return stats_entry_t{}; +} } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp index 06f1597149..7a025a9192 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp @@ -75,6 +75,10 @@ generate_stats(const output_config& cfg, const metadata& tool_metadata, const generator& data); +stats_entry_t +generate_stats(const output_config& cfg, + const metadata& tool_metadata, + const generator& data); void generate_stats(const output_config& cfg, const metadata& tool_metadata, diff --git a/projects/rocprofiler-sdk/source/lib/output/metadata.cpp b/projects/rocprofiler-sdk/source/lib/output/metadata.cpp index 4daefa9b48..01ed1ffb8f 100644 --- a/projects/rocprofiler-sdk/source/lib/output/metadata.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/metadata.cpp @@ -47,6 +47,20 @@ dimensions_info_callback(rocprofiler_counter_id_t /*id*/, dimensions_info->emplace_back(dim_info[j]); return ROCPROFILER_STATUS_SUCCESS; } + +rocprofiler_status_t +query_pc_sampling_configuration(const rocprofiler_pc_sampling_configuration_t* configs, + long unsigned int num_config, + void* user_data) +{ + auto* avail_configs = + static_cast*>(user_data); + for(size_t i = 0; i < num_config; i++) + { + avail_configs->emplace_back(configs[i]); + } + return ROCPROFILER_STATUS_SUCCESS; +} } // namespace kernel_symbol_info::kernel_symbol_info() @@ -78,7 +92,14 @@ metadata::metadata(inprocess) _gpu_agents.reserve(agents.size()); for(auto& itr : agents) { - if(itr.type == ROCPROFILER_AGENT_TYPE_GPU) _gpu_agents.emplace_back(&itr); + if(itr.type == ROCPROFILER_AGENT_TYPE_GPU) + { + _gpu_agents.emplace_back(&itr); + auto pc_configs = std::vector{}; + rocprofiler_query_pc_sampling_agent_configurations( + itr.id, query_pc_sampling_configuration, &pc_configs); + agent_pc_sample_config_info.emplace(itr.id, pc_configs); + } } // make sure they are sorted by node id @@ -112,6 +133,7 @@ void metadata::init(inprocess) void* user_data) { auto* data_v = static_cast(user_data); data_v->emplace(id, counter_info_vec_t{}); + for(size_t i = 0; i < num_counters; ++i) { auto _info = rocprofiler_counter_info_v0_t{}; @@ -260,6 +282,17 @@ metadata::get_gpu_agents() const return _data; } +pc_sample_config_vec_t +metadata::get_pc_sample_config_info(rocprofiler_agent_id_t _val) const +{ + auto _ret = pc_sample_config_vec_t{}; + auto pc_sample_config = agent_pc_sample_config_info.at(_val); + for(const auto& itr : pc_sample_config) + _ret.emplace_back(itr); + + return _ret; +} + counter_info_vec_t metadata::get_counter_info() const { @@ -417,5 +450,62 @@ metadata::get_string_entry(size_t key) const return ret; } + +int64_t +metadata::get_instruction_index(rocprofiler_pc_t record) +{ + inst_t ins; + ins.code_object_id = record.code_object_id; + ins.code_object_offset = record.code_object_offset; + auto itr = indexes.find(ins); + if(itr != indexes.end()) return itr->second; + auto idx = instruction_decoder.size(); + auto pc_instruction = decode_instruction(record); + instruction_decoder.emplace_back(pc_instruction->inst); + instruction_comment.emplace_back(pc_instruction->comment); + indexes.emplace(ins, idx); + return idx; +} + +void +metadata::add_decoder(rocprofiler_code_object_info_t* obj_data) +{ + if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE) + { + decoder.wlock( + [](auto& _decoder, rocprofiler_code_object_info_t* obj_data_v) { + _decoder.addDecoder(obj_data_v->uri, + obj_data_v->code_object_id, + obj_data_v->load_delta, + obj_data_v->load_size); + }, + obj_data); + } + else + { + decoder.wlock( + [](auto& _decoder, rocprofiler_code_object_info_t* obj_data_v) { + _decoder.addDecoder( + // NOLINTBEGIN(performance-no-int-to-ptr) + reinterpret_cast(obj_data_v->memory_base), + // NOLINTEND(performance-no-int-to-ptr) + obj_data_v->memory_size, + obj_data_v->code_object_id, + obj_data_v->load_delta, + obj_data_v->load_size); + }, + obj_data); + } +} + +std::unique_ptr +metadata::decode_instruction(rocprofiler_pc_t pc) +{ + return decoder.wlock( + [](auto& _decoder, uint64_t id, uint64_t addr) { return _decoder.get(id, addr); }, + pc.code_object_id, + pc.code_object_offset); +} + } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/metadata.hpp b/projects/rocprofiler-sdk/source/lib/output/metadata.hpp index e093a43449..a4405cde53 100644 --- a/projects/rocprofiler-sdk/source/lib/output/metadata.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/metadata.hpp @@ -25,6 +25,7 @@ #include "agent_info.hpp" #include "counter_info.hpp" #include "kernel_symbol_info.hpp" +#include "pc_sample_transform.hpp" #include "lib/common/container/small_vector.hpp" #include "lib/common/demangle.hpp" @@ -36,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +73,8 @@ using marker_message_ordered_map_t = std::map; using string_entry_map_t = std::unordered_map>; using counter_dimension_vec_t = std::vector; using external_corr_id_set_t = std::unordered_set; - +using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; +using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction; template using synced_map = common::Synchronized; @@ -82,19 +85,21 @@ struct metadata struct inprocess {}; - pid_t process_id = 0; - uint64_t process_start_ns = 0; - uint64_t process_end_ns = 0; - agent_info_vec_t agents = {}; - agent_info_map_t agents_map = {}; - agent_counter_info_map_t agent_counter_info = {}; - sdk::buffer_name_info buffer_names = {}; - sdk::callback_name_info callback_names = {}; - synced_map code_objects = {}; - synced_map kernel_symbols = {}; - synced_map marker_messages = {}; - synced_map string_entries = {}; - synced_map external_corr_ids = {}; + pid_t process_id = 0; + uint64_t process_start_ns = 0; + uint64_t process_end_ns = 0; + agent_info_vec_t agents = {}; + agent_info_map_t agents_map = {}; + agent_counter_info_map_t agent_counter_info = {}; + agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {}; + + sdk::buffer_name_info buffer_names = {}; + sdk::callback_name_info callback_names = {}; + synced_map code_objects = {}; + synced_map kernel_symbols = {}; + synced_map marker_messages = {}; + synced_map string_entries = {}; + synced_map external_corr_ids = {}; metadata() = default; metadata(inprocess); @@ -119,6 +124,13 @@ struct metadata agent_info_ptr_vec_t get_gpu_agents() const; counter_info_vec_t get_counter_info() const; counter_dimension_vec_t get_counter_dimension_info() const; + pc_sample_config_vec_t get_pc_sample_config_info(rocprofiler_agent_id_t _val) const; + std::vector get_pc_sample_instructions() const { return instruction_decoder; } + std::vector get_pc_sample_comments() const { return instruction_comment; } + std::string_view get_instruction(int64_t index) const { return instruction_decoder.at(index); } + std::string_view get_comment(int64_t index) const { return instruction_comment.at(index); } + int64_t get_instruction_index(rocprofiler_pc_t record); + void add_decoder(rocprofiler_code_object_info_t* obj_data_v); template Tp get_marker_messages(Tp&&); @@ -141,7 +153,13 @@ struct metadata const std::string* get_string_entry(size_t key) const; private: - bool inprocess_init = false; + bool inprocess_init = false; + std::unique_ptr decode_instruction(rocprofiler_pc_t pc); + synced_map decoder = {}; + // TODO: We may have to reserve the vector size based on map size + std::vector instruction_decoder = {}; + std::vector instruction_comment = {}; + std::map indexes = {}; }; template diff --git a/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp b/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp new file mode 100644 index 0000000000..ec8e9a62c9 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp @@ -0,0 +1,83 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include + +#include "lib/common/static_object.hpp" +#include "lib/common/synchronized.hpp" + +#include + +using pc_sample_config_vec_t = std::vector; +using agent_pc_sample_config_info_map_t = + std::unordered_map; + +namespace rocprofiler +{ +namespace tool +{ +struct inst_t +{ + uint64_t code_object_id; + uint64_t code_object_offset; + + bool operator==(const inst_t& inst) const + { + return this->code_object_id == inst.code_object_id && + this->code_object_offset == inst.code_object_offset; + } + + bool operator<(const inst_t& b) const + { + if(this->code_object_id == b.code_object_id) + return this->code_object_offset < b.code_object_offset; + return this->code_object_id < b.code_object_id; + }; +}; + +// TODO:: Check if we can template this structure +struct rocprofiler_tool_pc_sampling_host_trap_record_t +{ + rocprofiler_pc_sampling_record_host_trap_v0_t pc_sample_record; + int64_t inst_index; + + rocprofiler_tool_pc_sampling_host_trap_record_t( + rocprofiler_pc_sampling_record_host_trap_v0_t record, + int64_t index) + : pc_sample_record(record) + , inst_index(index) + {} + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("record", pc_sample_record)); + ar(cereal::make_nvp("inst_index", inst_index)); + } +}; + +} // namespace tool +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt index 229bd76e91..264eb310a2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt @@ -21,7 +21,9 @@ target_link_libraries( rocprofiler-sdk::rocprofiler-sdk-output-library rocprofiler-sdk::rocprofiler-sdk-cereal rocprofiler-sdk::rocprofiler-sdk-perfetto - rocprofiler-sdk::rocprofiler-sdk-otf2) + rocprofiler-sdk::rocprofiler-sdk-otf2 + rocprofiler-sdk::rocprofiler-sdk-dw + rocprofiler-sdk::rocprofiler-sdk-amd-comgr) set_target_properties( rocprofiler-sdk-tool diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index d3740b1a4a..d401cb21f2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -195,6 +195,22 @@ config::config() , counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))} { if(kernel_filter_include.empty()) kernel_filter_include = std::string{".*"}; + + std::unordered_map pc_sampling_unit_map = { + {"none", ROCPROFILER_PC_SAMPLING_UNIT_NONE}, + {"instructions", ROCPROFILER_PC_SAMPLING_UNIT_INSTRUCTIONS}, + {"cycles", ROCPROFILER_PC_SAMPLING_UNIT_CYCLES}, + {"time", ROCPROFILER_PC_SAMPLING_UNIT_TIME}}; + + std::unordered_map pc_sampling_method_map = + {{"none", ROCPROFILER_PC_SAMPLING_METHOD_NONE}, + {"stochastic", ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC}, + {"host_trap", ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP}}; + + pc_sampling_method_value = pc_sampling_method_map.at(pc_sampling_method); + if(pc_sampling_method_value == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) + pc_sampling_host_trap = true; + pc_sampling_unit_value = pc_sampling_unit_map.at(pc_sampling_unit); } std::string diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp index f4034f7480..b7a2246c6b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp @@ -75,29 +75,36 @@ struct config : output_config config& operator=(const config&) = default; config& operator=(config&&) noexcept = default; - bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true); - bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false); - bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false); - bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false); - bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false); - bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false); - bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false); - bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); - bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); - bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false); - bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false); - bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false); - bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false); - bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false); - bool rccl_api_trace = get_env("ROCPROF_RCCL_API_TRACE", false); - bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); - bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); + bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true); + bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false); + bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false); + bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false); + bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false); + bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false); + bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false); + bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); + bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); + bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false); + bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false); + bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false); + bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false); + bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false); + bool rccl_api_trace = get_env("ROCPROF_RCCL_API_TRACE", false); + bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); + bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); + bool pc_sampling_host_trap = false; + size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1); + rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE; + rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE; - int mpi_size = get_mpi_size(); - int mpi_rank = get_mpi_rank(); + std::string stats_summary_unit = get_env("ROCPROF_STATS_SUMMARY_UNITS", "nsec"); + int mpi_size = get_mpi_size(); + int mpi_rank = get_mpi_rank(); std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*"); std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", ""); + std::string pc_sampling_method = get_env("ROCPROF_PC_SAMPLING_METHOD", "none"); + std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none"); std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", ""); std::unordered_set kernel_filter_range = {}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp old mode 100755 new mode 100644 index b97daeb899..dea1016baf --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -157,17 +157,19 @@ struct buffer_ids rocprofiler_buffer_id_t counter_collection = {}; rocprofiler_buffer_id_t scratch_memory = {}; rocprofiler_buffer_id_t rccl_api_trace = {}; + rocprofiler_buffer_id_t pc_sampling_host_trap = {}; auto as_array() const { - return std::array{hsa_api_trace, + return std::array{hsa_api_trace, hip_api_trace, kernel_trace, memory_copy_trace, memory_allocation_trace, counter_collection, scratch_memory, - rccl_api_trace}; + rccl_api_trace, + pc_sampling_host_trap}; } }; @@ -547,6 +549,10 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, auto* obj_data = static_cast(record.payload); CHECK_NOTNULL(tool_metadata)->add_code_object(*obj_data); + if(tool::get_config().pc_sampling_host_trap) + { + CHECK_NOTNULL(tool_metadata)->add_decoder(obj_data); + } } else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { @@ -801,6 +807,58 @@ get_device_counting_service(rocprofiler_agent_id_t agent_id) return profile; } +int64_t +get_instruction_index(rocprofiler_pc_t pc) +{ + if(pc.code_object_id == ROCPROFILER_CODE_OBJECT_ID_NONE) + return -1; + else + return CHECK_NOTNULL(tool_metadata)->get_instruction_index(pc); +} + +} // namespace + +void +rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, + rocprofiler_buffer_id_t /* buffer_id*/, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* /*data*/, + uint64_t /* drop_count*/) +{ + if(!headers) return; + + for(size_t i = 0; i < num_headers; i++) + { + auto* cur_header = headers[i]; + + if(cur_header == nullptr) + { + throw std::runtime_error{ + "rocprofiler provided a null pointer to header. this should never happen"}; + } + else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING) + { + if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE) + { + auto* pc_sample = static_cast( + cur_header->payload); + + auto pc_sample_tool_record = + rocprofiler::tool::rocprofiler_tool_pc_sampling_host_trap_record_t( + *pc_sample, get_instruction_index(pc_sample->pc)); + + rocprofiler::tool::write_ring_buffer(pc_sample_tool_record, + domain_type::PC_SAMPLING_HOST_TRAP); + } + } + else + { + ROCP_FATAL << "unexpected rocprofiler_record_header_t category + kind"; + } + } +} + void dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data, rocprofiler_profile_config_id_t* config, @@ -1018,6 +1076,26 @@ finalize_rocprofv3(std::string_view context) } } +bool +if_pc_sample_config_match(rocprofiler_agent_id_t agent_id, + rocprofiler_pc_sampling_method_t pc_sampling_method, + rocprofiler_pc_sampling_unit_t pc_sampling_unit, + uint64_t pc_sampling_interval) +{ + auto pc_sampling_config = CHECK_NOTNULL(tool_metadata)->get_pc_sample_config_info(agent_id); + if(!pc_sampling_config.empty()) + { + for(auto config : pc_sampling_config) + { + if(config.method == pc_sampling_method && config.unit == pc_sampling_unit && + config.min_interval <= pc_sampling_interval && + config.max_interval >= pc_sampling_interval) + return true; + } + } + return false; +} + int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { @@ -1277,6 +1355,44 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "Could not configure external correlation id request service"); } + if(tool::get_config().pc_sampling_host_trap) + { + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + buffer_size, + buffer_watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + rocprofiler_pc_sampling_callback, + tool_data, + &get_buffers().pc_sampling_host_trap), + "buffer creation"); + bool config_match_found = false; + auto agent_ptr_vec = get_gpu_agents(); + for(auto& itr : agent_ptr_vec) + { + auto method = static_cast( + tool::get_config().pc_sampling_method_value); + auto unit = static_cast( + tool::get_config().pc_sampling_unit_value); + if(if_pc_sample_config_match( + itr->id, method, unit, tool::get_config().pc_sampling_interval)) + { + config_match_found = true; + int flags = 0; + ROCPROFILER_CALL(rocprofiler_configure_pc_sampling_service( + get_client_ctx(), + itr->id, + method, + unit, + tool::get_config().pc_sampling_interval, + get_buffers().pc_sampling_host_trap, + flags), + "configure PC sampling"); + } + } + if(!config_match_found) + ROCP_ERROR << "Given PC sampling configuration is not supported on any of the agents"; + } + for(auto itr : get_buffers().as_array()) { if(itr.handle > 0) @@ -1380,6 +1496,8 @@ tool_fini(void* /*tool_data*/) tool::memory_allocation_buffered_output_t{tool::get_config().memory_allocation_trace}; auto counters_records_output = tool::counter_records_buffered_output_t{tool::get_config().counter_collection}; + auto pc_sampling_host_trap_output = + tool::pc_sampling_host_trap_buffered_output_t{tool::get_config().pc_sampling_host_trap}; auto node_id_sort = [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; }; @@ -1402,6 +1520,7 @@ tool_fini(void* /*tool_data*/) generate_output(rccl_output, contributions); generate_output(counters_output, contributions); generate_output(scratch_memory_output, contributions); + generate_output(pc_sampling_host_trap_output, contributions); if(tool::get_config().stats && tool::get_config().csv_output) { @@ -1426,7 +1545,8 @@ tool_fini(void* /*tool_data*/) marker_output.get_generator(), scratch_memory_output.get_generator(), rccl_output.get_generator(), - memory_allocation_output.get_generator()); + memory_allocation_output.get_generator(), + pc_sampling_host_trap_output.get_generator()); json_ar.finish_process(); tool::close_json(json_ar); @@ -1489,6 +1609,7 @@ tool_fini(void* /*tool_data*/) destroy_output(scratch_memory_output); destroy_output(rccl_output); destroy_output(counters_records_output); + destroy_output(pc_sampling_host_trap_output); if(destructors) { @@ -1502,7 +1623,6 @@ tool_fini(void* /*tool_data*/) __gcov_dump(); #endif } -} // namespace std::vector get_tool_counter_dimension_info() diff --git a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt index 59ed2ebb59..f24d95fcb3 100644 --- a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt @@ -25,3 +25,4 @@ add_subdirectory(page-migration) add_subdirectory(hsa-queue-dependency) add_subdirectory(hip-graph) add_subdirectory(hsa-memory-allocation) +add_subdirectory(pc-sampling) diff --git a/projects/rocprofiler-sdk/tests/bin/pc-sampling/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/pc-sampling/CMakeLists.txt new file mode 100644 index 0000000000..505c80cfc5 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/pc-sampling/CMakeLists.txt @@ -0,0 +1,11 @@ +# +# Integration test applications +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project(rocprofiler-tests-bin LANGUAGES C CXX) + +set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib") + +# applications used by integration tests which DO NOT link to rocprofiler-sdk-roctx +add_subdirectory(exec-mask-manipulation) diff --git a/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/CMakeLists.txt new file mode 100644 index 0000000000..0a97752330 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/CMakeLists.txt @@ -0,0 +1,43 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(exec_mask_manipulation.cpp PROPERTIES LANGUAGE HIP) +add_executable(exec-mask-manipulation) +target_sources(exec-mask-manipulation PRIVATE exec_mask_manipulation.cpp) +# debug symbols required for PC sampling decoding validation +target_compile_options(exec-mask-manipulation PRIVATE -W -Wall -Wextra -Wpedantic + -Wshadow -Werror -g) + +find_package(Threads REQUIRED) +target_link_libraries(exec-mask-manipulation PRIVATE Threads::Threads) diff --git a/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/exec_mask_manipulation.cpp b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/exec_mask_manipulation.cpp new file mode 100644 index 0000000000..19d8934d1b --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/exec_mask_manipulation.cpp @@ -0,0 +1,537 @@ +/* +Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +#include + +#define ITER_NUM 16 * 1024 +#define BLOCK_SIZE 1024 + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + auto _hip_api_print_lk = auto_lock_t{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + throw std::runtime_error("hip_api_call"); \ + } \ + } + +namespace +{ +using auto_lock_t = std::unique_lock; +auto print_lock = std::mutex{}; + +void +check_hip_error(void); +} // namespace + +// ====================================================== +__global__ void +kernel1(const int c) +{ + int a = 0; +#pragma nounroll + for(int i = 0; i < ITER_NUM; i++) + { + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + } +} + +__global__ void +kernel2(const int c) +{ + int a = 0; +#pragma nounroll + for(int i = 0; i < ITER_NUM; i++) + { + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + } +} + +__global__ void +kernel3(const float c) +{ + double a = threadIdx.x; + float i = 0; + float d = 0; + float e = 0; + int tid_even = threadIdx.x % 2; + for(int j = 0; j < ITER_NUM; j++) + { + if(tid_even == 0) + { + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + } + else + { + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + } + } +} + +// ====================================================== + +void +run_kernel() +{ + for(int i = 1; i <= 64; i++) + { + if(i % 2 == 1) + kernel1<<>>(i); + else + kernel2<<>>(i); + + check_hip_error(); + HIP_API_CALL(hipDeviceSynchronize()); + } + + float arg = 0; + kernel3<<>>(arg); + check_hip_error(); + HIP_API_CALL(hipDeviceSynchronize()); +} + +int +main() +{ + run_kernel(); + return 0; +} + +namespace +{ +void +check_hip_error(void) +{ + hipError_t err = hipGetLastError(); + if(err != hipSuccess) + { + auto_lock_t _lk{print_lock}; + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; + throw std::runtime_error("hip_api_call"); + } +} +} // namespace diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt index dac9c12e1f..8ed9eb62f9 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt @@ -34,3 +34,4 @@ add_subdirectory(aborted-app) add_subdirectory(summary) add_subdirectory(roctracer-roctx) add_subdirectory(scratch-memory) +add_subdirectory(pc-sampling) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/CMakeLists.txt new file mode 100644 index 0000000000..bd893c3c83 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/CMakeLists.txt @@ -0,0 +1,5 @@ +# +# PC sampling tests +# + +add_subdirectory(host-trap) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/CMakeLists.txt new file mode 100644 index 0000000000..87d509b8f9 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/CMakeLists.txt @@ -0,0 +1,5 @@ +# +# PC sampling tests +# + +add_subdirectory(exec-mask-manipulation) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/CMakeLists.txt new file mode 100644 index 0000000000..587ac89a71 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/CMakeLists.txt @@ -0,0 +1,150 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-pc-sampling + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + input.json input.yml) + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute + COMMAND + $ --pc-sampling-unit time + --pc-sampling-method host_trap --pc-sampling-interval 1 -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input -o out --output-format csv json + -- $) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(cc-env-pc-sampling "${PRELOAD_ENV}") + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling" + ENVIRONMENT + "${cc-env-pc-sampling}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.json -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input -- + $) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling" + ENVIRONMENT + "${cc-env-pc-sampling}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.yml -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input -o out --output-format csv + json -- $) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling" + ENVIRONMENT + "${cc-env-pc-sampling}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +# ========================= Validation tests + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_exec_mask_manipulation_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_pc_sampling_host_trap.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-validate + PROPERTIES TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling" + DEPENDS + "rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_exec_mask_manipulation_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_pc_sampling_host_trap.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-validate + PROPERTIES TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling" + DEPENDS + "rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_exec_mask_manipulation_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_pc_sampling_host_trap.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-validate + PROPERTIES TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling" + DEPENDS + "rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/conftest.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/conftest.py new file mode 100644 index 0000000000..5e182db9cf --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/conftest.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import json +import os +import pytest +import pandas as pd + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list + + +def pytest_addoption(parser): + parser.addoption( + "--input-csv", + action="store", + help="Path to CSV file.", + ) + + parser.addoption( + "--input-json", + action="store", + help="Path to CSV file.", + ) + + parser.addoption( + "--all-sampled", + action="store", + help="All SW and HW units must be sampled.", + ) + + +@pytest.fixture +def input_csv(request): + filename = request.config.getoption("--input-csv") + if not os.path.isfile(filename): + # The CSV file is not generated, because the dependency test + # responsible to generate this file was skipped or failed. + # Thus emit the message to skip this test as well. + print("PC sampling unavailable") + else: + with open(filename, "r") as inp: + return pd.read_csv( + inp, + na_filter=False, # parse empty fields as "" + keep_default_na=False, # parse empty fields as "" + dtype={ + "Exec_Mask": "uint64", + "Instruction": str, + "Instruction_Comment": str, + }, + ) + + +@pytest.fixture +def input_json(request): + filename = request.config.getoption("--input-json") + with open(filename, "r") as inp: + # Significant overhead of 5-6secs observed when feeding + # data into the dotdict. + # Using plain python dict instead + return collapse_dict_list(json.load(inp)) + + +@pytest.fixture +def all_sampled(request): + _all_sampled_str = request.config.getoption("--all-sampled") + return _all_sampled_str == "True" diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.json b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.json new file mode 100644 index 0000000000..80bf53eb37 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.json @@ -0,0 +1,14 @@ +{ + "jobs": [ + { + "pc_sampling_unit": "time", + "pc_sampling_method": "host_trap", + "pc_sampling_interval": 1, + "output_file": "out", + "output_format": [ + "csv", + "json" + ] + } + ] +} diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.yml b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.yml new file mode 100644 index 0000000000..692e2ed67a --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.yml @@ -0,0 +1,4 @@ +jobs: + - pc_sampling_unit: "time" + pc_sampling_method: "host_trap" + pc_sampling_interval: 1 diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/pytest.ini b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py new file mode 100644 index 0000000000..7ccd72d98e --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 + +import itertools +import sys +import pytest +import numpy as np +import pandas as pd + + +# =========================== Validating CSV output + + +# Keep this in case we decide to revert workgroup_id information +def validate_workgoup_id_x_y_z(df, max_x, max_y, max_z): + assert (df["Workgroup_Size_X"].astype(int) >= 0).all() + assert (df["Workgroup_Size_X"].astype(int) <= max_x).all() + + assert (df["Workgroup_Size_Y"].astype(int) >= 0).all() + assert (df["Workgroup_Size_Y"].astype(int) <= max_y).all() + + assert (df["Workgroup_Size_Z"].astype(int) >= 0).all() + assert (df["Workgroup_Size_Z"].astype(int) <= max_z).all() + + +# Keep this in case we decide to revert wave_id information +def validate_wave_id(df, max_wave_id): + assert (df["Wave_Id"].astype(int) <= max_wave_id).all() + + +# Keep this in case we decide to revert wave_id information +def validate_chiplet(df, max_chiplet): + assert (df["Chiplet"].astype(int) <= max_chiplet).all() + + +def validate_instruction_decoding( + df, + inst_str, + exec_mask_uint64: np.uint64 = None, + source_code_lines_range: (int, int) = None, + all_source_lines_samples=False, +): + # Make a copy, so that we don't work (modify) a view. + df_inst = df[df["Instruction"].apply(lambda inst: inst.startswith(inst_str))].copy() + + assert not df_inst.empty + # assert the exec mask if requested + if exec_mask_uint64 is not None: + assert (df_inst["Exec_Mask"].astype(np.uint64) == exec_mask_uint64).all() + + # assert whether the samples source code lines belongs to the provided range + if source_code_lines_range is not None: + start_range, end_range = source_code_lines_range + # The instruction comment is isually in the following format: /path/to/source/file.cpp:line_num + df_inst["source_line_num"] = df_inst["Instruction_Comment"].apply( + lambda source_line: int(source_line.split(":")[-1]) + ) + assert (df_inst["source_line_num"] >= start_range).all() + assert (df_inst["source_line_num"] <= end_range).all() + # if requested, check if all lines from the range are sampled + if all_source_lines_samples: + assert len(df_inst["source_line_num"].unique()) == ( + end_range - start_range + 1 + ) + + +def validate_instruction_comment(df): + # Instruction comment must always be present, since the testing application + # is built with debug symbols. + assert ( + (df["Instruction_Comment"] != "") & (df["Instruction_Comment"] != "nullptr") + ).all() + + +def validate_instruction_correlation_id_relation(df): + # Samples with no decoded instructions originates from either + # blit kernels or self modifying code. The correlation id for this + # type of samples should alway be zero. + # Thus, Correlation_Id is 0 `iff`` instruction is not decoded. + + # The previous statement has two implications. + # Implication 1: If the instruction is not decoded, then correlation id is 0. + samples_no_instruction_df = df[ + (df["Instruction"] == "") | (df["Instruction"] == "nullptr") + ] + assert (samples_no_instruction_df["Correlation_Id"] == 0).all() + + # Implication 2: If the correlation id is 0, then the instruction is not decoded. + samples_cid_zero_df = df[df["Correlation_Id"] == 0] + assert ( + (samples_cid_zero_df["Instruction"] == "") + | (samples_cid_zero_df["Instruction"] == "nullptr") + ).all() + + assert len(samples_no_instruction_df) == len(samples_cid_zero_df) + + # Since we're not enabling any kind of API tracing, + # internal correlation id should match the dispatch id + assert all(df["Correlation_Id"] == df["Dispatch_Id"]) + + +def validate_exec_mask_based_on_correlation_id(df): + # The function assumes that each kernel launches 1024 blocks. + # Each block contains number of threads that matches correlation ID of the kernel. + # The exec mask of a sample should contain number of ones equal to + # the correlation ID of the kernel during which execution the sample was generated. + df["active_SIMD_threads"] = df["Exec_Mask"].apply( + lambda exec_mask: bin(exec_mask).count("1") + ) + assert (df["active_SIMD_threads"] == df["Correlation_Id"]).all() + + # TODO: Comment out the following code if it causes spurious fails. + # The more conservative constraint based on the experience follows. + # The exec mask of sampled instructions of the kernels respect the following pattern: + # cid -> exec + # 1 -> 0b1 + # 2 -> 0b11 + # 3 -> 0b111 + # ... + # 64 -> 0xffffffffffffffff + + df["Exec_Mask2"] = ( + df["Correlation_Id"].astype(int).apply(lambda x: int("0b" + (x * "1"), 2)) + ) + + # TODO: exec should be in hex and that will ease the comparison + assert (df["Exec_Mask"].astype(np.uint64) == df["Exec_Mask2"].astype(np.uint64)).all() + + +def exec_mask_manipulation_validate_csv(df, all_sampled=False): + assert not df.empty + + validate_instruction_comment(df) + validate_instruction_correlation_id_relation(df) + + # Validate samples with non-zero correlation IDs (and with decoded instructions) + samples_cid_non_zero_df = df[df["Correlation_Id"] != 0] + + # exactly 65 kernels and 65 correlation id + assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) >= 1).all() + assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) <= 65).all() + if all_sampled: + # all correlation IDs must be sampled + assert len(samples_cid_non_zero_df["Correlation_Id"].astype(int).unique()) == 65 + + first_64_kernels_df = samples_cid_non_zero_df[ + samples_cid_non_zero_df["Correlation_Id"] <= 64 + ] + + # Make a copy, so that we don't work (modify) a view. + validate_exec_mask_based_on_correlation_id(first_64_kernels_df.copy()) + + # validate the last kernel + kernel_65_df = df[df["Correlation_Id"] == 65] + + # assert that v_rcp instructions are properly decoded + # the v_rcp is executed by even SIMD threads + validate_instruction_decoding( + kernel_65_df, + "v_rcp_f64", + exec_mask_uint64=np.uint64(int("5555555555555555", 16)), + source_code_lines_range=(288, 387), + all_source_lines_samples=all_sampled, + ) + + # assert that v_fmac_f32 instructions are properly decoded + # the v_fmac_f32 is executed by odd SIMD threads + validate_instruction_decoding( + kernel_65_df, + "v_fmac_f32", + exec_mask_uint64=np.uint64(int("AAAAAAAAAAAAAAAA", 16)), + source_code_lines_range=(391, 490), + all_source_lines_samples=all_sampled, + ) + + +def test_validate_pc_sampling_exec_mask_manipulation_csv( + input_csv: pd.DataFrame, all_sampled: bool +): + exec_mask_manipulation_validate_csv(input_csv, all_sampled=all_sampled) + + +# ========================= Validating JSON output + + +def validate_json_exec_mask_manipulation(data_json, all_sampled=False): + # Although functional programming might look more elegant, + # I was trying to avoid multiple iteration over the list of samples. + # Thus, I decided to use procedural programming instead. + # Although, it would be more elegant to wrap some of the checks in dedicated functions, + # I noticed that it can introduce significant overhead, so I decided to inline those checks. + + # the function assume homogenous system + agents = data_json["agents"] + gpu_agents = list(filter(lambda agent: agent["type"] == 2, agents)) + # There should be at least one GPU agent + assert len(gpu_agents) > 0 + first_gpu_agent = gpu_agents[0] + num_xcc = first_gpu_agent["num_xcc"] + max_waves_per_simd = first_gpu_agent["max_waves_per_simd"] + simd_per_cu = first_gpu_agent["simd_per_cu"] + + instructions = data_json["strings"]["pc_sample_instructions"] + comments = data_json["strings"]["pc_sample_comments"] + + # execution mask where even SIMD lanes are active + # correspond to the v_rcp_f64 instructions of the last kernel + even_simds_active_exec_mask = np.uint64(int("5555555555555555", 16)) + # start and end source code lines of the v_rcp_f64 instructions of the last kernel + v_rcp_f64_start_line_num, v_rcp_f64_end_line_num = 288, 387 + # execution mask where even SIMD lanes are active + # correspond to the v_rcp_f64 instructions of the last kernel + odd_simds_active_exec_mask = np.uint64(int("AAAAAAAAAAAAAAAA", 16)) + # start and end source code lines of the v_fmac_f32 0 instructions of the last kernel + v_fmac_f32_start_line_num, v_fmac_f32_end_line_num = 391, 490 + + # sampled wave_ids of the last kernel + kernel65_sampled_wave_in_grp = set() + # sampled source lines of the last kernel matching v_rcp_f64 instructions + kernel65_v_rcp_64_sampled_source_line_set = set() + # sampled source lines of the last kernel matching v_rcp_f64 instructions + kernel65_v_fmac_f32_sampled_source_line_set = set() + # sampled correlation IDs + sampled_cids_set = set() + # pairs of sampled SIMD ids and waveslot IDs + sampled_simd_waveslots_pairs = set() + # sampled chiplets + sampled_chiplets = set() + # sample VMIDs + sampled_vmids = set() + + for sample in data_json["buffer_records"]["pc_sample_host_trap"]: + record = sample["record"] + cid = record["corr_id"]["internal"] + + # pull information from hw_id + hw_id = record["hw_id"] + sampled_chiplets.add(hw_id["chiplet"]) + sampled_simd_waveslots_pairs.add((hw_id["simd_id"], hw_id["wave_id"])) + sampled_vmids.add(hw_id["vm_id"]) + + # Checks specific for all samples + + # cids must be non-negative numbers + assert cid >= 0 + + inst_index = sample["inst_index"] + + # Since we're not enabling any kind of API tracing, the internal correlation id should + # be equal to the dispatch_id + assert cid == record["dispatch_id"] + + if cid == 0: + # Samples originates either from a blit kernel or self-modifying code. + # Thus, code object is uknown, as well as the instruction. + assert record["pc"]["code_object_id"] == 0 + assert inst_index == -1 + else: + # Update set of sampled cids + sampled_cids_set.add(cid) + + # All samples with non-zero correlation ID should pass the following checks + # code object is know, so as the instruction + assert record["pc"]["code_object_id"] != 0 + assert inst_index != -1 + + wgid = record["wrkgrp_id"] + # check corrdinates of the workgroup + assert wgid["x"] >= 0 and wgid["x"] <= 1023 + assert wgid["y"] == 0 + assert wgid["z"] == 0 + + wave_in_grp = record["wave_in_grp"] + exec_mask = record["exec_mask"] + + if cid < 65: + # checks specific for samples from first 64 kernels + assert wave_in_grp == 0 + # inline if possible + # validate_json_exec_mask_based_on_cid(sample.record) + + # The function assumes that each kernel launches 1024 blocks. + # Each block contains number of threads that matches correlation ID of the kernel. + # The exec mask of a sample should contain number of ones equal to + # the correlation ID of the kernel during which execution the sample was generated. + assert bin(exec_mask).count("1") == cid + + # TODO: Comment out the following code if it causes spurious fails. + # The more conservative constraint based on the experience follows. + # The exec mask of sampled instructions of the kernels respect the following pattern: + # cid -> exec + # 1 -> 0b1 + # 2 -> 0b11 + # 3 -> 0b111 + # ... + # 64 -> 0xffffffffffffffff + exec_mask_str = "0b" + "1" * cid + assert np.uint64(exec_mask) == np.uint64(int(exec_mask_str, 2)) + else: + # No more that 65 cids + assert cid == 65 + # Monitor wave_in_group being sampled + kernel65_sampled_wave_in_grp.add(wave_in_grp) + # chekcs specific for samples from the last kernel + assert wave_in_grp >= 0 and wave_in_grp <= 3 + + # validate instruction decoding + inst = instructions[inst_index] + comm = comments[inst_index] + # The instruction comment is isually in the following format: + # /path/to/source/file.cpp:line_num + line_num = int(comm.split(":")[-1]) + if inst.startswith("v_rcp_f64"): + # even SIMD lanes active + assert np.uint64(exec_mask) == even_simds_active_exec_mask + assert ( + line_num >= v_rcp_f64_start_line_num + and line_num <= v_rcp_f64_end_line_num + ) + kernel65_v_rcp_64_sampled_source_line_set.add(line_num) + elif inst.startswith("v_fmac_f32"): + # odd SIMD lanes active + assert np.uint64(exec_mask) == odd_simds_active_exec_mask + assert ( + line_num >= v_fmac_f32_start_line_num + and line_num <= v_fmac_f32_end_line_num + ) + kernel65_v_fmac_f32_sampled_source_line_set.add(line_num) + + if all_sampled: + # All cids that belongs to the range [1, 65] should be samples + assert len(sampled_cids_set) == 65 + + # all wave_ids that belongs to the range [0, 3] should be sampled for the last kernel + assert len(kernel65_sampled_wave_in_grp) == 4 + + # all source lines matches v_rcp_f64 instructions of the last kernel should be sampled + assert len(kernel65_v_rcp_64_sampled_source_line_set) == ( + v_rcp_f64_end_line_num - v_rcp_f64_start_line_num + 1 + ) + # all source lines matches v_fmac_f32 instructions of the last kernel should be sampled + assert len(kernel65_v_fmac_f32_sampled_source_line_set) == ( + v_fmac_f32_end_line_num - v_fmac_f32_start_line_num + 1 + ) + + # all chiplets must be sampled + assert len(sampled_chiplets) == num_xcc + # all (simd ID, waveslot ID) pairs must be samples + assert len(sampled_simd_waveslots_pairs) == simd_per_cu * max_waves_per_simd + + # assert chiplet index + assert all(map(lambda chiplet: 0 <= chiplet < num_xcc, sampled_chiplets)) + # assert (SIMD ID, waveslot ID) combinations + assert all( + map( + lambda simd_waveslot: (0 <= simd_waveslot[0] < simd_per_cu) + and (0 <= simd_waveslot[1] < max_waves_per_simd), + sampled_simd_waveslots_pairs, + ) + ) + # all samples should belong to the same VMID + assert len(sampled_vmids) == 1 + + +def test_validate_pc_sampling_exec_mask_manipulation_json( + input_json, input_csv: pd.DataFrame, all_sampled: bool +): + data = input_json["rocprofiler-sdk-tool"] + # The same amount of samples should be in both CSV and JSON files. + assert len(input_csv) == len(data["buffer_records"]["pc_sample_host_trap"]) + # # validating JSON output + validate_json_exec_mask_manipulation(data, all_sampled=all_sampled) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code)