diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index 4145140d01..ca9c4e5bd2 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -109,6 +109,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec - Check to force tools to initialize context id with zero - Support to specify hardware counters for collection using rocprofv3 as `rocprofv3 --pmc [COUNTER [COUNTER ...]]` - Memory Allocation Tracing +- PC sampling tool support with CSV and JSON output formats ### Changed diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index 8209b04eec..69fa85587e 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -4,6 +4,7 @@ import os import sys import argparse import subprocess +import numpy class dotdict(dict): @@ -167,6 +168,30 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies, scratch, and allocations), and Kernel dispatches.", ) + pc_sampling_options = parser.add_argument_group("PC sampling options") + + pc_sampling_options.add_argument( + "--pc-sampling-unit", + help="", + default=None, + type=str.lower, + choices=("instructions", "cycles", "time"), + ) + + pc_sampling_options.add_argument( + "--pc-sampling-method", + help="", + default=None, + type=str.lower, + choices=("stochastic", "host_trap"), + ) + + pc_sampling_options.add_argument( + "--pc-sampling-interval", + help="", + default=None, + type=numpy.uint64, + ) basic_tracing_options = parser.add_argument_group("Basic tracing options") # Add the arguments @@ -904,6 +929,18 @@ def run(app_args, args, **kwargs): if args.log_level in ("info", "trace", "env"): log_config(app_env) + if args.pc_sampling_unit or args.pc_sampling_method or args.pc_sampling_method: + if not ( + args.pc_sampling_unit and args.pc_sampling_method and args.pc_sampling_method + ): + fatal_error("All three PC sampling configurations need to be set") + + update_env("ROCPROFILER_PC_SAMPLING_BETA_ENABLED", "ON") + update_env("ROCPROF_PC_SAMPLING_UNIT", args.pc_sampling_unit) + update_env("ROCPROF_PC_SAMPLING_METHOD", args.pc_sampling_method) + update_env("ROCPROF_PC_SAMPLING_INTERVAL", args.pc_sampling_interval) + update_env("ROCPROF_ENABLE_PC_SAMPLING", args.pc_sampling) + if use_execv: # does not return os.execvpe(app_args[0], app_args, env=app_env) diff --git a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst index 70a945fdc4..5f1adf7ae4 100644 --- a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst +++ b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3.rst @@ -162,6 +162,18 @@ Here is the sample of commonly used ``rocprofv3`` command-line options. Some opt - Perfetto shared memory size hint in KB. default: 64 KB - Extension + * - ``--pc-sampling-method`` + - Type of PC Sampling, currently only host trap method is supported + - PC Sampling Configurations + + * - ``--pc-sampling-unit`` + - The unit appropriate to the PC sampling type/method, currently only time unit is supported + - PC Sampling Configurations + + * - ``--pc-sampling-interval`` + - Frequency at which PC samples are generated + - PC Sampling Configurations + To see exhaustive list of ``rocprofv3`` options, run: .. code-block:: bash @@ -675,6 +687,9 @@ Properties trace. - **``preload``** *(array)*: Libraries to prepend to LD_PRELOAD (usually for sanitizers). + - **``pc_sampling_unit``** *(string)*: pc sampling unit. + - **``pc_sampling_method``** *(string)*: pc sampling method. + - **``pc_sampling_interval``** *(integer)*: pc sampling interval. .. code-block:: shell @@ -1039,6 +1054,14 @@ Properties - **`id`** *(integer, required)*: Dimension ID. - **`instance_size`** *(integer, required)*: Size of the instance. - **`name`** *(string, required)*: Name of the dimension. + - **``pc_sample_instructions``** *(array)*: Array of decoded + instructions matching sampled PCs from pc_sample_host_trap + section. + - **``pc_sample_comments``** *(array)*: Comments matching + assembly instructions from pc_sample_instructions array. If + debug symbols are available, comments provide instructions + to source-line mapping. Otherwise, a comment is an empty + string. - **`code_objects`** *(array, required)*: Code object records. - **Items** *(object)* - **`size`** *(integer, required)*: Size of the code object. @@ -1103,6 +1126,37 @@ Properties - **`arch_vgpr_count`** *(integer, required)*: Count of VGPRs. - **`sgpr_count`** *(integer, required)*: Count of SGPRs. - **`lds_block_size_v`** *(integer, required)*: Size of LDS block. + - **``pc_sample_host_trap``** *(array)*: Host Trap PC Sampling records. + - **Items** *(object)* + - **``hw_id``** *(object)*: Describes hardware part on which sampled wave was running. + - **``chiplet``** *(integer)*: Chiplet index. + - **``wave_id``** *(integer)*: Wave slot index. + - **``simd_id``** *(integer)*: SIMD index. + - **``pipe_id``** *(integer)*: Pipe index. + - **``cu_or_wgp_id``** *(integer)*: Index of compute unit or workgroup processer. + - **``shader_array_id``** *(integer)*: Shader array index. + - **``shader_engine_id``** *(integer)*: Shader engine + index. + - **``workgroup_id``** *(integer)*: Workgroup position in the 3D. + - **``vm_id``** *(integer)*: Virtual memory ID. + - **``queue_id``** *(integer)*: Queue id. + - **``microengine_id``** *(integer)*: ACE + (microengine) index. + - **``pc``** *(object)*: Encapsulates information about + sampled PC. + - **``code_object_id``** *(integer)*: Code object id. + - **``code_object_offset``** *(integer)*: Offset within the object if the latter is known. Otherwise, virtual address of the PC. + - **``exec_mask``** *(integer)*: Execution mask indicating active SIMD lanes of sampled wave. + - **``timestamp``** *(integer)*: Timestamp. + - **``dispatch_id``** *(integer)*: Dispatch id. + - **``correlation_id``** *(object)*: Correlation ID information. + - **``internal``** *(integer)*: Internal correlation ID. + - **``external``** *(integer)*: External correlation ID. + - **``rocprofiler_dim3_t``** *(object)*: Position of the workgroup in 3D grid. + - **``x``** *(integer)*: Dimension x. + - **``y``** *(integer)*: Dimension y. + - **``z``** *(integer)*: Dimension z. + - **``wave_in_group``** *(integer)*: Wave position within the workgroup (0-31). - **`buffer_records`** *(object, required)*: Buffer record details. - **`kernel_dispatch`** *(array)*: Kernel dispatch records. - **Items** *(object)* diff --git a/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json b/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json index f9438dc752..4cbd97aa7b 100644 --- a/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json +++ b/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json @@ -559,6 +559,14 @@ "required": [ "dimension_ids" ] + }, + "pc_sample_instructions": { + "type": "array", + "description": "Array of decoded instructions matching sampled PCs from pc_sample_host_trap section." + }, + "pc_sample_comments": { + "type": "array", + "description": "Comments matching assembly instructions from pc_sample_instructions array. If debug symbols are available, comments provide instructions to source-line mapping. Otherwise, a comment is an empty string." } } }, @@ -925,7 +933,129 @@ "lds_block_size_v" ] } - } + }, + "pc_sample_host_trap": { + "type": "array", + "description": "Host Trap PC Sampling records.", + "items": { + "type": "object", + "properties": { + "hw_id": { + "type": "object", + "description" : "Describes hardware part on which sampled wave was running.", + "properties": { + "chiplet":{ + "type": "integer", + "description": "Chiplet index." + }, + "wave_id ":{ + "type": "integer", + "description": "Wave slot index." + }, + "simd_id":{ + "type": "integer", + "description": "SIMD index." + }, + "pipe_id ":{ + "type": "integer", + "description": "Pipe index." + }, + "cu_or_wgp_id":{ + "type": "integer", + "description": "Index of compute unit or workgroup processer." + }, + "shader_array_id":{ + "type": "integer", + "description": "Shader array index." + }, + "shader_engine_id":{ + "type": "integer", + "description": "Shader engine index." + }, + "workgroup_id":{ + "type": "integer", + "description": "Workgroup position in the 3D." + }, + "vm_id":{ + "type": "integer", + "description": "Virtual memory ID." + }, + "queue_id":{ + "type": "integer", + "description": "Queue id." + }, + "microengine_id":{ + "type": "integer", + "description": "ACE (microengine) index." + } + } + }, + "pc": { + "type": "object", + "description": "Encapsulates information about sampled PC.", + "properties": { + "code_object_id":{ + "type": "integer", + "description": "Code object id" + }, + "code_object_offset":{ + "type": "integer", + "description": "Offset within the object if the latter is known. Otherwise, virtual address of the PC." + } + } + }, + "exec_mask":{ + "type": "integer", + "description": "Execution mask indicating active SIMD lanes of sampled wave." + }, + "timestamp":{ + "type": "integer", + "description": "Timestamp." + }, + "dispatch_id":{ + "type": "integer", + "description": "Dispatch id." + }, + "correlation_id": { + "type": "object", + "description": "Correlation ID information.", + "properties": { + "internal": { + "type": "integer", + "description": "Internal correlation ID." + }, + "external": { + "type": "integer", + "description": "External correlation ID." + } + } + }, + "rocprofiler_dim3_t": { + "type": "object", + "description": " Position of the workgroup in 3D grid.", + "properties": { + "x": { + "type": "integer", + "description": "Dimension x." + }, + "y": { + "type": "integer", + "description": "Dimension y." + }, + "z": { + "type": "integer", + "description": "Dimension z." + } + } + }, + "wave_in_group": { + "type": "integer", + "description": "Wave position within the workgroup (0-31)." + } + } + } + } + } }, "buffer_records": { diff --git a/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json b/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json index 08f999a639..63eb21ef79 100644 --- a/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json +++ b/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json @@ -144,7 +144,19 @@ "preload":{ "type": "array", "description": "Libraries to prepend to LD_PRELOAD (usually for sanitizers)" - } + }, + "pc_sampling_unit": { + "type": "string", + "description": "pc sampling unit" + }, + "pc_sampling_method": { + "type": "string", + "description": "pc sampling method" + }, + "pc_sampling_interval": { + "type": "integer", + "description": "pc sampling interval" + } } } } diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp index ff62fc7d9d..5c1e053b8c 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization.hpp @@ -735,6 +735,44 @@ save(ArchiveT& ar, rocprofiler_agent_cache_t data) ROCP_SDK_SAVE_DATA_FIELD(latency); ROCP_SDK_SAVE_DATA_FIELD(type); } +template +void +save(ArchiveT& ar, rocprofiler_pc_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(code_object_id); + ROCP_SDK_SAVE_DATA_FIELD(code_object_offset); +} + +template +void +save(ArchiveT& ar, rocprofiler_pc_sampling_hw_id_v0_t data) +{ + ROCP_SDK_SAVE_DATA_BITFIELD("chiplet", chiplet); + ROCP_SDK_SAVE_DATA_BITFIELD("wave_id", wave_id); + ROCP_SDK_SAVE_DATA_BITFIELD("simd_id", simd_id); + ROCP_SDK_SAVE_DATA_BITFIELD("pipe_id", pipe_id); + ROCP_SDK_SAVE_DATA_BITFIELD("cu_or_wgp_id", cu_or_wgp_id); + ROCP_SDK_SAVE_DATA_BITFIELD("shader_array_id", shader_array_id); + ROCP_SDK_SAVE_DATA_BITFIELD("shader_engine_id", shader_engine_id); + ROCP_SDK_SAVE_DATA_BITFIELD("workgroup_id ", workgroup_id); + ROCP_SDK_SAVE_DATA_BITFIELD("vm_id", vm_id); + ROCP_SDK_SAVE_DATA_BITFIELD("queue_id", queue_id); + ROCP_SDK_SAVE_DATA_BITFIELD("microengine_id", microengine_id); +} + +template +void +save(ArchiveT& ar, rocprofiler_pc_sampling_record_host_trap_v0_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(hw_id); + ROCP_SDK_SAVE_DATA_FIELD(pc); + ROCP_SDK_SAVE_DATA_FIELD(exec_mask); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(dispatch_id); + ROCP_SDK_SAVE_DATA_VALUE("corr_id", correlation_id); + ROCP_SDK_SAVE_DATA_VALUE("wrkgrp_id", workgroup_id); + ROCP_SDK_SAVE_DATA_BITFIELD("wave_in_grp", wave_in_group); +} template void diff --git a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp index 992f40605b..27f943c774 100644 --- a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp @@ -24,6 +24,7 @@ #include "counter_info.hpp" #include "generator.hpp" +#include "pc_sample_transform.hpp" #include "statistics.hpp" #include "tmp_file_buffer.hpp" @@ -159,5 +160,8 @@ using memory_allocation_buffered_output_t = using counter_records_buffered_output_t = ::rocprofiler::tool::buffered_output; +using pc_sampling_host_trap_buffered_output_t = + buffered_output; } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/csv.hpp b/projects/rocprofiler-sdk/source/lib/output/csv.hpp index f66a2561f6..d3a7f7dd4b 100644 --- a/projects/rocprofiler-sdk/source/lib/output/csv.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/csv.hpp @@ -99,17 +99,18 @@ struct csv_encoder } }; -using api_csv_encoder = csv_encoder<7>; -using agent_info_csv_encoder = csv_encoder<53>; -using kernel_trace_csv_encoder = csv_encoder<18>; -using counter_collection_csv_encoder = csv_encoder<18>; -using memory_copy_csv_encoder = csv_encoder<7>; -using memory_allocation_csv_encoder = csv_encoder<8>; -using marker_csv_encoder = csv_encoder<7>; -using list_basic_metrics_csv_encoder = csv_encoder<5>; -using list_derived_metrics_csv_encoder = csv_encoder<5>; -using scratch_memory_encoder = csv_encoder<8>; -using stats_csv_encoder = csv_encoder<8>; +using api_csv_encoder = csv_encoder<7>; +using agent_info_csv_encoder = csv_encoder<53>; +using kernel_trace_csv_encoder = csv_encoder<18>; +using counter_collection_csv_encoder = csv_encoder<18>; +using memory_copy_csv_encoder = csv_encoder<7>; +using memory_allocation_csv_encoder = csv_encoder<8>; +using marker_csv_encoder = csv_encoder<7>; +using list_basic_metrics_csv_encoder = csv_encoder<5>; +using list_derived_metrics_csv_encoder = csv_encoder<5>; +using scratch_memory_encoder = csv_encoder<8>; +using stats_csv_encoder = csv_encoder<8>; +using pc_sampling_host_trap_csv_encoder = csv_encoder<6>; } // namespace csv } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp old mode 100755 new mode 100644 index 8d281a86f7..f476189150 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp @@ -57,6 +57,10 @@ DEFINE_BUFFER_TYPE_NAME(MEMORY_ALLOCATION, "memory_allocation", "memory_allocation_stats") DEFINE_BUFFER_TYPE_NAME(COUNTER_VALUES, "COUNTER_VALUES", "counter_values", "no_filename") +DEFINE_BUFFER_TYPE_NAME(PC_SAMPLING_HOST_TRAP, + "PC_SAMPLING_HOST_TRAP", + "pc_sampling_host_trap", + "pc_sampling_host_trap_stats") #undef DEFINE_BUFFER_TYPE_NAME diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp index 8fc8ced370..28b41b376a 100644 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp @@ -36,6 +36,7 @@ enum class domain_type RCCL, MEMORY_ALLOCATION, COUNTER_VALUES, + PC_SAMPLING_HOST_TRAP, LAST, }; diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp old mode 100755 new mode 100644 index a95965d338..f1ea6c64a2 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp @@ -717,6 +717,65 @@ generate_csv(const output_config& cfg, } } +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats) +{ + if(data.empty()) return; + + if(cfg.stats && stats) + write_stats(get_stats_output_file(cfg, domain_type::PC_SAMPLING_HOST_TRAP), stats.entries); + + auto ofs = tool::csv_output_file{cfg, + domain_type::PC_SAMPLING_HOST_TRAP, + tool::csv::pc_sampling_host_trap_csv_encoder{}, + {"Sample_Timestamp", + "Exec_Mask", + "Dispatch_Id", + "Instruction", + "Instruction_Comment", + "Correlation_Id"}}; + for(auto ditr : data) + { + for(const auto& record : data.get(ditr)) + { + if(record.inst_index == -1) + { + auto row_ss = std::stringstream{}; + std::string inst_comment = + "Unrecognized code object id, physical virtual address of PC:" + + std::to_string(record.pc_sample_record.pc.code_object_offset); + rocprofiler::tool::csv::pc_sampling_host_trap_csv_encoder::write_row( + row_ss, + record.pc_sample_record.timestamp, + record.pc_sample_record.exec_mask, + record.pc_sample_record.dispatch_id, + "", + inst_comment, + record.pc_sample_record.correlation_id.internal); + + ofs << row_ss.str(); + } + else + { + auto row_ss = std::stringstream{}; + rocprofiler::tool::csv::pc_sampling_host_trap_csv_encoder::write_row( + row_ss, + record.pc_sample_record.timestamp, + record.pc_sample_record.exec_mask, + record.pc_sample_record.dispatch_id, + tool_metadata.get_instruction(record.inst_index), + tool_metadata.get_comment(record.inst_index), + record.pc_sample_record.correlation_id.internal); + + ofs << row_ss.str(); + } + } + } +} + void generate_csv(const output_config& cfg, const metadata& /*tool_metadata*/, diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp index c38f2d42f7..9c03a6821a 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp @@ -92,6 +92,11 @@ generate_csv(const output_config& const metadata& tool_metadata, const generator& data, const stats_entry_t& stats); +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats); void generate_csv(const output_config& cfg, diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp index e1dbac5b96..e5e20c5172 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp @@ -124,7 +124,13 @@ write_json(json_output& json_ar, json_ar.startNode(); json_ar(cereal::make_nvp("callback_records", callback_name_info)); json_ar(cereal::make_nvp("buffer_records", buffer_name_info)); + json_ar( + cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions())); + json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments())); json_ar(cereal::make_nvp("marker_api", marker_msg_data)); + json_ar( + cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions())); + json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments())); { auto _extern_corr_id_strings = std::map{}; @@ -178,7 +184,8 @@ write_json(json_output& json_ar, generator marker_api_gen, generator scratch_memory_gen, generator rccl_api_gen, - generator memory_allocation_gen) + generator memory_allocation_gen, + generator pc_sampling_gen) { // summary @@ -219,6 +226,7 @@ write_json(json_output& json_ar, json_ar(cereal::make_nvp("memory_copy", memory_copy_gen)); json_ar(cereal::make_nvp("memory_allocation", memory_allocation_gen)); json_ar(cereal::make_nvp("scratch_memory", scratch_memory_gen)); + json_ar(cereal::make_nvp("pc_sample_host_trap", pc_sampling_gen)); json_ar.finishNode(); } } diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp index 9effe0b7f8..93baa8c3fc 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp @@ -93,6 +93,7 @@ write_json(json_output& json generator marker_api_gen, generator scratch_memory_gen, generator rccl_api_gen, - generator memory_allocation_gen); + generator memory_allocation_gen, + generator pc_sampling_gen); } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp index 50a7d338e6..a8ab73cb3f 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp @@ -411,5 +411,13 @@ generate_stats(const output_config& cfg, if(cfg.stats_summary) generate_stats(cfg, _os, "SUMMARY", data_v, _indent); } + +stats_entry_t +generate_stats(const output_config& /* cfg*/, + const metadata& /*tool_metadata*/, + const generator& /*data*/) +{ + return stats_entry_t{}; +} } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp index 06f1597149..7a025a9192 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp @@ -75,6 +75,10 @@ generate_stats(const output_config& cfg, const metadata& tool_metadata, const generator& data); +stats_entry_t +generate_stats(const output_config& cfg, + const metadata& tool_metadata, + const generator& data); void generate_stats(const output_config& cfg, const metadata& tool_metadata, diff --git a/projects/rocprofiler-sdk/source/lib/output/metadata.cpp b/projects/rocprofiler-sdk/source/lib/output/metadata.cpp index 4daefa9b48..01ed1ffb8f 100644 --- a/projects/rocprofiler-sdk/source/lib/output/metadata.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/metadata.cpp @@ -47,6 +47,20 @@ dimensions_info_callback(rocprofiler_counter_id_t /*id*/, dimensions_info->emplace_back(dim_info[j]); return ROCPROFILER_STATUS_SUCCESS; } + +rocprofiler_status_t +query_pc_sampling_configuration(const rocprofiler_pc_sampling_configuration_t* configs, + long unsigned int num_config, + void* user_data) +{ + auto* avail_configs = + static_cast*>(user_data); + for(size_t i = 0; i < num_config; i++) + { + avail_configs->emplace_back(configs[i]); + } + return ROCPROFILER_STATUS_SUCCESS; +} } // namespace kernel_symbol_info::kernel_symbol_info() @@ -78,7 +92,14 @@ metadata::metadata(inprocess) _gpu_agents.reserve(agents.size()); for(auto& itr : agents) { - if(itr.type == ROCPROFILER_AGENT_TYPE_GPU) _gpu_agents.emplace_back(&itr); + if(itr.type == ROCPROFILER_AGENT_TYPE_GPU) + { + _gpu_agents.emplace_back(&itr); + auto pc_configs = std::vector{}; + rocprofiler_query_pc_sampling_agent_configurations( + itr.id, query_pc_sampling_configuration, &pc_configs); + agent_pc_sample_config_info.emplace(itr.id, pc_configs); + } } // make sure they are sorted by node id @@ -112,6 +133,7 @@ void metadata::init(inprocess) void* user_data) { auto* data_v = static_cast(user_data); data_v->emplace(id, counter_info_vec_t{}); + for(size_t i = 0; i < num_counters; ++i) { auto _info = rocprofiler_counter_info_v0_t{}; @@ -260,6 +282,17 @@ metadata::get_gpu_agents() const return _data; } +pc_sample_config_vec_t +metadata::get_pc_sample_config_info(rocprofiler_agent_id_t _val) const +{ + auto _ret = pc_sample_config_vec_t{}; + auto pc_sample_config = agent_pc_sample_config_info.at(_val); + for(const auto& itr : pc_sample_config) + _ret.emplace_back(itr); + + return _ret; +} + counter_info_vec_t metadata::get_counter_info() const { @@ -417,5 +450,62 @@ metadata::get_string_entry(size_t key) const return ret; } + +int64_t +metadata::get_instruction_index(rocprofiler_pc_t record) +{ + inst_t ins; + ins.code_object_id = record.code_object_id; + ins.code_object_offset = record.code_object_offset; + auto itr = indexes.find(ins); + if(itr != indexes.end()) return itr->second; + auto idx = instruction_decoder.size(); + auto pc_instruction = decode_instruction(record); + instruction_decoder.emplace_back(pc_instruction->inst); + instruction_comment.emplace_back(pc_instruction->comment); + indexes.emplace(ins, idx); + return idx; +} + +void +metadata::add_decoder(rocprofiler_code_object_info_t* obj_data) +{ + if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE) + { + decoder.wlock( + [](auto& _decoder, rocprofiler_code_object_info_t* obj_data_v) { + _decoder.addDecoder(obj_data_v->uri, + obj_data_v->code_object_id, + obj_data_v->load_delta, + obj_data_v->load_size); + }, + obj_data); + } + else + { + decoder.wlock( + [](auto& _decoder, rocprofiler_code_object_info_t* obj_data_v) { + _decoder.addDecoder( + // NOLINTBEGIN(performance-no-int-to-ptr) + reinterpret_cast(obj_data_v->memory_base), + // NOLINTEND(performance-no-int-to-ptr) + obj_data_v->memory_size, + obj_data_v->code_object_id, + obj_data_v->load_delta, + obj_data_v->load_size); + }, + obj_data); + } +} + +std::unique_ptr +metadata::decode_instruction(rocprofiler_pc_t pc) +{ + return decoder.wlock( + [](auto& _decoder, uint64_t id, uint64_t addr) { return _decoder.get(id, addr); }, + pc.code_object_id, + pc.code_object_offset); +} + } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/metadata.hpp b/projects/rocprofiler-sdk/source/lib/output/metadata.hpp index e093a43449..a4405cde53 100644 --- a/projects/rocprofiler-sdk/source/lib/output/metadata.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/metadata.hpp @@ -25,6 +25,7 @@ #include "agent_info.hpp" #include "counter_info.hpp" #include "kernel_symbol_info.hpp" +#include "pc_sample_transform.hpp" #include "lib/common/container/small_vector.hpp" #include "lib/common/demangle.hpp" @@ -36,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +73,8 @@ using marker_message_ordered_map_t = std::map; using string_entry_map_t = std::unordered_map>; using counter_dimension_vec_t = std::vector; using external_corr_id_set_t = std::unordered_set; - +using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; +using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction; template using synced_map = common::Synchronized; @@ -82,19 +85,21 @@ struct metadata struct inprocess {}; - pid_t process_id = 0; - uint64_t process_start_ns = 0; - uint64_t process_end_ns = 0; - agent_info_vec_t agents = {}; - agent_info_map_t agents_map = {}; - agent_counter_info_map_t agent_counter_info = {}; - sdk::buffer_name_info buffer_names = {}; - sdk::callback_name_info callback_names = {}; - synced_map code_objects = {}; - synced_map kernel_symbols = {}; - synced_map marker_messages = {}; - synced_map string_entries = {}; - synced_map external_corr_ids = {}; + pid_t process_id = 0; + uint64_t process_start_ns = 0; + uint64_t process_end_ns = 0; + agent_info_vec_t agents = {}; + agent_info_map_t agents_map = {}; + agent_counter_info_map_t agent_counter_info = {}; + agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {}; + + sdk::buffer_name_info buffer_names = {}; + sdk::callback_name_info callback_names = {}; + synced_map code_objects = {}; + synced_map kernel_symbols = {}; + synced_map marker_messages = {}; + synced_map string_entries = {}; + synced_map external_corr_ids = {}; metadata() = default; metadata(inprocess); @@ -119,6 +124,13 @@ struct metadata agent_info_ptr_vec_t get_gpu_agents() const; counter_info_vec_t get_counter_info() const; counter_dimension_vec_t get_counter_dimension_info() const; + pc_sample_config_vec_t get_pc_sample_config_info(rocprofiler_agent_id_t _val) const; + std::vector get_pc_sample_instructions() const { return instruction_decoder; } + std::vector get_pc_sample_comments() const { return instruction_comment; } + std::string_view get_instruction(int64_t index) const { return instruction_decoder.at(index); } + std::string_view get_comment(int64_t index) const { return instruction_comment.at(index); } + int64_t get_instruction_index(rocprofiler_pc_t record); + void add_decoder(rocprofiler_code_object_info_t* obj_data_v); template Tp get_marker_messages(Tp&&); @@ -141,7 +153,13 @@ struct metadata const std::string* get_string_entry(size_t key) const; private: - bool inprocess_init = false; + bool inprocess_init = false; + std::unique_ptr decode_instruction(rocprofiler_pc_t pc); + synced_map decoder = {}; + // TODO: We may have to reserve the vector size based on map size + std::vector instruction_decoder = {}; + std::vector instruction_comment = {}; + std::map indexes = {}; }; template diff --git a/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp b/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp new file mode 100644 index 0000000000..ec8e9a62c9 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/output/pc_sample_transform.hpp @@ -0,0 +1,83 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include + +#include "lib/common/static_object.hpp" +#include "lib/common/synchronized.hpp" + +#include + +using pc_sample_config_vec_t = std::vector; +using agent_pc_sample_config_info_map_t = + std::unordered_map; + +namespace rocprofiler +{ +namespace tool +{ +struct inst_t +{ + uint64_t code_object_id; + uint64_t code_object_offset; + + bool operator==(const inst_t& inst) const + { + return this->code_object_id == inst.code_object_id && + this->code_object_offset == inst.code_object_offset; + } + + bool operator<(const inst_t& b) const + { + if(this->code_object_id == b.code_object_id) + return this->code_object_offset < b.code_object_offset; + return this->code_object_id < b.code_object_id; + }; +}; + +// TODO:: Check if we can template this structure +struct rocprofiler_tool_pc_sampling_host_trap_record_t +{ + rocprofiler_pc_sampling_record_host_trap_v0_t pc_sample_record; + int64_t inst_index; + + rocprofiler_tool_pc_sampling_host_trap_record_t( + rocprofiler_pc_sampling_record_host_trap_v0_t record, + int64_t index) + : pc_sample_record(record) + , inst_index(index) + {} + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("record", pc_sample_record)); + ar(cereal::make_nvp("inst_index", inst_index)); + } +}; + +} // namespace tool +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt index 229bd76e91..264eb310a2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt @@ -21,7 +21,9 @@ target_link_libraries( rocprofiler-sdk::rocprofiler-sdk-output-library rocprofiler-sdk::rocprofiler-sdk-cereal rocprofiler-sdk::rocprofiler-sdk-perfetto - rocprofiler-sdk::rocprofiler-sdk-otf2) + rocprofiler-sdk::rocprofiler-sdk-otf2 + rocprofiler-sdk::rocprofiler-sdk-dw + rocprofiler-sdk::rocprofiler-sdk-amd-comgr) set_target_properties( rocprofiler-sdk-tool diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index d3740b1a4a..d401cb21f2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -195,6 +195,22 @@ config::config() , counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))} { if(kernel_filter_include.empty()) kernel_filter_include = std::string{".*"}; + + std::unordered_map pc_sampling_unit_map = { + {"none", ROCPROFILER_PC_SAMPLING_UNIT_NONE}, + {"instructions", ROCPROFILER_PC_SAMPLING_UNIT_INSTRUCTIONS}, + {"cycles", ROCPROFILER_PC_SAMPLING_UNIT_CYCLES}, + {"time", ROCPROFILER_PC_SAMPLING_UNIT_TIME}}; + + std::unordered_map pc_sampling_method_map = + {{"none", ROCPROFILER_PC_SAMPLING_METHOD_NONE}, + {"stochastic", ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC}, + {"host_trap", ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP}}; + + pc_sampling_method_value = pc_sampling_method_map.at(pc_sampling_method); + if(pc_sampling_method_value == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP) + pc_sampling_host_trap = true; + pc_sampling_unit_value = pc_sampling_unit_map.at(pc_sampling_unit); } std::string diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp index f4034f7480..b7a2246c6b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp @@ -75,29 +75,36 @@ struct config : output_config config& operator=(const config&) = default; config& operator=(config&&) noexcept = default; - bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true); - bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false); - bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false); - bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false); - bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false); - bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false); - bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false); - bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); - bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); - bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false); - bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false); - bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false); - bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false); - bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false); - bool rccl_api_trace = get_env("ROCPROF_RCCL_API_TRACE", false); - bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); - bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); + bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true); + bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false); + bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false); + bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false); + bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false); + bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false); + bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false); + bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); + bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); + bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false); + bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false); + bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false); + bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false); + bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false); + bool rccl_api_trace = get_env("ROCPROF_RCCL_API_TRACE", false); + bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); + bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); + bool pc_sampling_host_trap = false; + size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1); + rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE; + rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE; - int mpi_size = get_mpi_size(); - int mpi_rank = get_mpi_rank(); + std::string stats_summary_unit = get_env("ROCPROF_STATS_SUMMARY_UNITS", "nsec"); + int mpi_size = get_mpi_size(); + int mpi_rank = get_mpi_rank(); std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*"); std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", ""); + std::string pc_sampling_method = get_env("ROCPROF_PC_SAMPLING_METHOD", "none"); + std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none"); std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", ""); std::unordered_set kernel_filter_range = {}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp old mode 100755 new mode 100644 index b97daeb899..dea1016baf --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -157,17 +157,19 @@ struct buffer_ids rocprofiler_buffer_id_t counter_collection = {}; rocprofiler_buffer_id_t scratch_memory = {}; rocprofiler_buffer_id_t rccl_api_trace = {}; + rocprofiler_buffer_id_t pc_sampling_host_trap = {}; auto as_array() const { - return std::array{hsa_api_trace, + return std::array{hsa_api_trace, hip_api_trace, kernel_trace, memory_copy_trace, memory_allocation_trace, counter_collection, scratch_memory, - rccl_api_trace}; + rccl_api_trace, + pc_sampling_host_trap}; } }; @@ -547,6 +549,10 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, auto* obj_data = static_cast(record.payload); CHECK_NOTNULL(tool_metadata)->add_code_object(*obj_data); + if(tool::get_config().pc_sampling_host_trap) + { + CHECK_NOTNULL(tool_metadata)->add_decoder(obj_data); + } } else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { @@ -801,6 +807,58 @@ get_device_counting_service(rocprofiler_agent_id_t agent_id) return profile; } +int64_t +get_instruction_index(rocprofiler_pc_t pc) +{ + if(pc.code_object_id == ROCPROFILER_CODE_OBJECT_ID_NONE) + return -1; + else + return CHECK_NOTNULL(tool_metadata)->get_instruction_index(pc); +} + +} // namespace + +void +rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/, + rocprofiler_buffer_id_t /* buffer_id*/, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* /*data*/, + uint64_t /* drop_count*/) +{ + if(!headers) return; + + for(size_t i = 0; i < num_headers; i++) + { + auto* cur_header = headers[i]; + + if(cur_header == nullptr) + { + throw std::runtime_error{ + "rocprofiler provided a null pointer to header. this should never happen"}; + } + else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING) + { + if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE) + { + auto* pc_sample = static_cast( + cur_header->payload); + + auto pc_sample_tool_record = + rocprofiler::tool::rocprofiler_tool_pc_sampling_host_trap_record_t( + *pc_sample, get_instruction_index(pc_sample->pc)); + + rocprofiler::tool::write_ring_buffer(pc_sample_tool_record, + domain_type::PC_SAMPLING_HOST_TRAP); + } + } + else + { + ROCP_FATAL << "unexpected rocprofiler_record_header_t category + kind"; + } + } +} + void dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data, rocprofiler_profile_config_id_t* config, @@ -1018,6 +1076,26 @@ finalize_rocprofv3(std::string_view context) } } +bool +if_pc_sample_config_match(rocprofiler_agent_id_t agent_id, + rocprofiler_pc_sampling_method_t pc_sampling_method, + rocprofiler_pc_sampling_unit_t pc_sampling_unit, + uint64_t pc_sampling_interval) +{ + auto pc_sampling_config = CHECK_NOTNULL(tool_metadata)->get_pc_sample_config_info(agent_id); + if(!pc_sampling_config.empty()) + { + for(auto config : pc_sampling_config) + { + if(config.method == pc_sampling_method && config.unit == pc_sampling_unit && + config.min_interval <= pc_sampling_interval && + config.max_interval >= pc_sampling_interval) + return true; + } + } + return false; +} + int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { @@ -1277,6 +1355,44 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "Could not configure external correlation id request service"); } + if(tool::get_config().pc_sampling_host_trap) + { + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + buffer_size, + buffer_watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + rocprofiler_pc_sampling_callback, + tool_data, + &get_buffers().pc_sampling_host_trap), + "buffer creation"); + bool config_match_found = false; + auto agent_ptr_vec = get_gpu_agents(); + for(auto& itr : agent_ptr_vec) + { + auto method = static_cast( + tool::get_config().pc_sampling_method_value); + auto unit = static_cast( + tool::get_config().pc_sampling_unit_value); + if(if_pc_sample_config_match( + itr->id, method, unit, tool::get_config().pc_sampling_interval)) + { + config_match_found = true; + int flags = 0; + ROCPROFILER_CALL(rocprofiler_configure_pc_sampling_service( + get_client_ctx(), + itr->id, + method, + unit, + tool::get_config().pc_sampling_interval, + get_buffers().pc_sampling_host_trap, + flags), + "configure PC sampling"); + } + } + if(!config_match_found) + ROCP_ERROR << "Given PC sampling configuration is not supported on any of the agents"; + } + for(auto itr : get_buffers().as_array()) { if(itr.handle > 0) @@ -1380,6 +1496,8 @@ tool_fini(void* /*tool_data*/) tool::memory_allocation_buffered_output_t{tool::get_config().memory_allocation_trace}; auto counters_records_output = tool::counter_records_buffered_output_t{tool::get_config().counter_collection}; + auto pc_sampling_host_trap_output = + tool::pc_sampling_host_trap_buffered_output_t{tool::get_config().pc_sampling_host_trap}; auto node_id_sort = [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; }; @@ -1402,6 +1520,7 @@ tool_fini(void* /*tool_data*/) generate_output(rccl_output, contributions); generate_output(counters_output, contributions); generate_output(scratch_memory_output, contributions); + generate_output(pc_sampling_host_trap_output, contributions); if(tool::get_config().stats && tool::get_config().csv_output) { @@ -1426,7 +1545,8 @@ tool_fini(void* /*tool_data*/) marker_output.get_generator(), scratch_memory_output.get_generator(), rccl_output.get_generator(), - memory_allocation_output.get_generator()); + memory_allocation_output.get_generator(), + pc_sampling_host_trap_output.get_generator()); json_ar.finish_process(); tool::close_json(json_ar); @@ -1489,6 +1609,7 @@ tool_fini(void* /*tool_data*/) destroy_output(scratch_memory_output); destroy_output(rccl_output); destroy_output(counters_records_output); + destroy_output(pc_sampling_host_trap_output); if(destructors) { @@ -1502,7 +1623,6 @@ tool_fini(void* /*tool_data*/) __gcov_dump(); #endif } -} // namespace std::vector get_tool_counter_dimension_info() diff --git a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt index 59ed2ebb59..f24d95fcb3 100644 --- a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt @@ -25,3 +25,4 @@ add_subdirectory(page-migration) add_subdirectory(hsa-queue-dependency) add_subdirectory(hip-graph) add_subdirectory(hsa-memory-allocation) +add_subdirectory(pc-sampling) diff --git a/projects/rocprofiler-sdk/tests/bin/pc-sampling/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/pc-sampling/CMakeLists.txt new file mode 100644 index 0000000000..505c80cfc5 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/pc-sampling/CMakeLists.txt @@ -0,0 +1,11 @@ +# +# Integration test applications +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project(rocprofiler-tests-bin LANGUAGES C CXX) + +set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib") + +# applications used by integration tests which DO NOT link to rocprofiler-sdk-roctx +add_subdirectory(exec-mask-manipulation) diff --git a/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/CMakeLists.txt new file mode 100644 index 0000000000..0a97752330 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/CMakeLists.txt @@ -0,0 +1,43 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(exec_mask_manipulation.cpp PROPERTIES LANGUAGE HIP) +add_executable(exec-mask-manipulation) +target_sources(exec-mask-manipulation PRIVATE exec_mask_manipulation.cpp) +# debug symbols required for PC sampling decoding validation +target_compile_options(exec-mask-manipulation PRIVATE -W -Wall -Wextra -Wpedantic + -Wshadow -Werror -g) + +find_package(Threads REQUIRED) +target_link_libraries(exec-mask-manipulation PRIVATE Threads::Threads) diff --git a/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/exec_mask_manipulation.cpp b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/exec_mask_manipulation.cpp new file mode 100644 index 0000000000..19d8934d1b --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/pc-sampling/exec-mask-manipulation/exec_mask_manipulation.cpp @@ -0,0 +1,537 @@ +/* +Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +#include + +#define ITER_NUM 16 * 1024 +#define BLOCK_SIZE 1024 + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + auto _hip_api_print_lk = auto_lock_t{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + throw std::runtime_error("hip_api_call"); \ + } \ + } + +namespace +{ +using auto_lock_t = std::unique_lock; +auto print_lock = std::mutex{}; + +void +check_hip_error(void); +} // namespace + +// ====================================================== +__global__ void +kernel1(const int c) +{ + int a = 0; +#pragma nounroll + for(int i = 0; i < ITER_NUM; i++) + { + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c)); + } +} + +__global__ void +kernel2(const int c) +{ + int a = 0; +#pragma nounroll + for(int i = 0; i < ITER_NUM; i++) + { + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c)); + } +} + +__global__ void +kernel3(const float c) +{ + double a = threadIdx.x; + float i = 0; + float d = 0; + float e = 0; + int tid_even = threadIdx.x % 2; + for(int j = 0; j < ITER_NUM; j++) + { + if(tid_even == 0) + { + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c)); + } + else + { + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e)); + } + } +} + +// ====================================================== + +void +run_kernel() +{ + for(int i = 1; i <= 64; i++) + { + if(i % 2 == 1) + kernel1<<>>(i); + else + kernel2<<>>(i); + + check_hip_error(); + HIP_API_CALL(hipDeviceSynchronize()); + } + + float arg = 0; + kernel3<<>>(arg); + check_hip_error(); + HIP_API_CALL(hipDeviceSynchronize()); +} + +int +main() +{ + run_kernel(); + return 0; +} + +namespace +{ +void +check_hip_error(void) +{ + hipError_t err = hipGetLastError(); + if(err != hipSuccess) + { + auto_lock_t _lk{print_lock}; + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; + throw std::runtime_error("hip_api_call"); + } +} +} // namespace diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt index dac9c12e1f..8ed9eb62f9 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt @@ -34,3 +34,4 @@ add_subdirectory(aborted-app) add_subdirectory(summary) add_subdirectory(roctracer-roctx) add_subdirectory(scratch-memory) +add_subdirectory(pc-sampling) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/CMakeLists.txt new file mode 100644 index 0000000000..bd893c3c83 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/CMakeLists.txt @@ -0,0 +1,5 @@ +# +# PC sampling tests +# + +add_subdirectory(host-trap) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/CMakeLists.txt new file mode 100644 index 0000000000..87d509b8f9 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/CMakeLists.txt @@ -0,0 +1,5 @@ +# +# PC sampling tests +# + +add_subdirectory(exec-mask-manipulation) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/CMakeLists.txt new file mode 100644 index 0000000000..587ac89a71 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/CMakeLists.txt @@ -0,0 +1,150 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-pc-sampling + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py + input.json input.yml) + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute + COMMAND + $ --pc-sampling-unit time + --pc-sampling-method host_trap --pc-sampling-interval 1 -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input -o out --output-format csv json + -- $) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(cc-env-pc-sampling "${PRELOAD_ENV}") + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling" + ENVIRONMENT + "${cc-env-pc-sampling}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.json -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input -- + $) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling" + ENVIRONMENT + "${cc-env-pc-sampling}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input.yml -d + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input -o out --output-format csv + json -- $) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests;pc-sampling" + ENVIRONMENT + "${cc-env-pc-sampling}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +# ========================= Validation tests + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_exec_mask_manipulation_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_pc_sampling_host_trap.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-validate + PROPERTIES TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling" + DEPENDS + "rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_exec_mask_manipulation_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_pc_sampling_host_trap.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-validate + PROPERTIES TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling" + DEPENDS + "rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") + +add_test( + NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k + test_validate_pc_sampling_exec_mask_manipulation_ --input-csv + ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_pc_sampling_host_trap.csv + --input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_results.json + --all-sampled False) + +set_tests_properties( + rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-validate + PROPERTIES TIMEOUT + 60 + LABELS + "integration-tests;pc-sampling" + DEPENDS + "rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + SKIP_REGULAR_EXPRESSION + "PC sampling unavailable") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/conftest.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/conftest.py new file mode 100644 index 0000000000..5e182db9cf --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/conftest.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import json +import os +import pytest +import pandas as pd + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list + + +def pytest_addoption(parser): + parser.addoption( + "--input-csv", + action="store", + help="Path to CSV file.", + ) + + parser.addoption( + "--input-json", + action="store", + help="Path to CSV file.", + ) + + parser.addoption( + "--all-sampled", + action="store", + help="All SW and HW units must be sampled.", + ) + + +@pytest.fixture +def input_csv(request): + filename = request.config.getoption("--input-csv") + if not os.path.isfile(filename): + # The CSV file is not generated, because the dependency test + # responsible to generate this file was skipped or failed. + # Thus emit the message to skip this test as well. + print("PC sampling unavailable") + else: + with open(filename, "r") as inp: + return pd.read_csv( + inp, + na_filter=False, # parse empty fields as "" + keep_default_na=False, # parse empty fields as "" + dtype={ + "Exec_Mask": "uint64", + "Instruction": str, + "Instruction_Comment": str, + }, + ) + + +@pytest.fixture +def input_json(request): + filename = request.config.getoption("--input-json") + with open(filename, "r") as inp: + # Significant overhead of 5-6secs observed when feeding + # data into the dotdict. + # Using plain python dict instead + return collapse_dict_list(json.load(inp)) + + +@pytest.fixture +def all_sampled(request): + _all_sampled_str = request.config.getoption("--all-sampled") + return _all_sampled_str == "True" diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.json b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.json new file mode 100644 index 0000000000..80bf53eb37 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.json @@ -0,0 +1,14 @@ +{ + "jobs": [ + { + "pc_sampling_unit": "time", + "pc_sampling_method": "host_trap", + "pc_sampling_interval": 1, + "output_file": "out", + "output_format": [ + "csv", + "json" + ] + } + ] +} diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.yml b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.yml new file mode 100644 index 0000000000..692e2ed67a --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/input.yml @@ -0,0 +1,4 @@ +jobs: + - pc_sampling_unit: "time" + pc_sampling_method: "host_trap" + pc_sampling_interval: 1 diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/pytest.ini b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py new file mode 100644 index 0000000000..7ccd72d98e --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/pc-sampling/host-trap/exec-mask-manipulation/validate.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 + +import itertools +import sys +import pytest +import numpy as np +import pandas as pd + + +# =========================== Validating CSV output + + +# Keep this in case we decide to revert workgroup_id information +def validate_workgoup_id_x_y_z(df, max_x, max_y, max_z): + assert (df["Workgroup_Size_X"].astype(int) >= 0).all() + assert (df["Workgroup_Size_X"].astype(int) <= max_x).all() + + assert (df["Workgroup_Size_Y"].astype(int) >= 0).all() + assert (df["Workgroup_Size_Y"].astype(int) <= max_y).all() + + assert (df["Workgroup_Size_Z"].astype(int) >= 0).all() + assert (df["Workgroup_Size_Z"].astype(int) <= max_z).all() + + +# Keep this in case we decide to revert wave_id information +def validate_wave_id(df, max_wave_id): + assert (df["Wave_Id"].astype(int) <= max_wave_id).all() + + +# Keep this in case we decide to revert wave_id information +def validate_chiplet(df, max_chiplet): + assert (df["Chiplet"].astype(int) <= max_chiplet).all() + + +def validate_instruction_decoding( + df, + inst_str, + exec_mask_uint64: np.uint64 = None, + source_code_lines_range: (int, int) = None, + all_source_lines_samples=False, +): + # Make a copy, so that we don't work (modify) a view. + df_inst = df[df["Instruction"].apply(lambda inst: inst.startswith(inst_str))].copy() + + assert not df_inst.empty + # assert the exec mask if requested + if exec_mask_uint64 is not None: + assert (df_inst["Exec_Mask"].astype(np.uint64) == exec_mask_uint64).all() + + # assert whether the samples source code lines belongs to the provided range + if source_code_lines_range is not None: + start_range, end_range = source_code_lines_range + # The instruction comment is isually in the following format: /path/to/source/file.cpp:line_num + df_inst["source_line_num"] = df_inst["Instruction_Comment"].apply( + lambda source_line: int(source_line.split(":")[-1]) + ) + assert (df_inst["source_line_num"] >= start_range).all() + assert (df_inst["source_line_num"] <= end_range).all() + # if requested, check if all lines from the range are sampled + if all_source_lines_samples: + assert len(df_inst["source_line_num"].unique()) == ( + end_range - start_range + 1 + ) + + +def validate_instruction_comment(df): + # Instruction comment must always be present, since the testing application + # is built with debug symbols. + assert ( + (df["Instruction_Comment"] != "") & (df["Instruction_Comment"] != "nullptr") + ).all() + + +def validate_instruction_correlation_id_relation(df): + # Samples with no decoded instructions originates from either + # blit kernels or self modifying code. The correlation id for this + # type of samples should alway be zero. + # Thus, Correlation_Id is 0 `iff`` instruction is not decoded. + + # The previous statement has two implications. + # Implication 1: If the instruction is not decoded, then correlation id is 0. + samples_no_instruction_df = df[ + (df["Instruction"] == "") | (df["Instruction"] == "nullptr") + ] + assert (samples_no_instruction_df["Correlation_Id"] == 0).all() + + # Implication 2: If the correlation id is 0, then the instruction is not decoded. + samples_cid_zero_df = df[df["Correlation_Id"] == 0] + assert ( + (samples_cid_zero_df["Instruction"] == "") + | (samples_cid_zero_df["Instruction"] == "nullptr") + ).all() + + assert len(samples_no_instruction_df) == len(samples_cid_zero_df) + + # Since we're not enabling any kind of API tracing, + # internal correlation id should match the dispatch id + assert all(df["Correlation_Id"] == df["Dispatch_Id"]) + + +def validate_exec_mask_based_on_correlation_id(df): + # The function assumes that each kernel launches 1024 blocks. + # Each block contains number of threads that matches correlation ID of the kernel. + # The exec mask of a sample should contain number of ones equal to + # the correlation ID of the kernel during which execution the sample was generated. + df["active_SIMD_threads"] = df["Exec_Mask"].apply( + lambda exec_mask: bin(exec_mask).count("1") + ) + assert (df["active_SIMD_threads"] == df["Correlation_Id"]).all() + + # TODO: Comment out the following code if it causes spurious fails. + # The more conservative constraint based on the experience follows. + # The exec mask of sampled instructions of the kernels respect the following pattern: + # cid -> exec + # 1 -> 0b1 + # 2 -> 0b11 + # 3 -> 0b111 + # ... + # 64 -> 0xffffffffffffffff + + df["Exec_Mask2"] = ( + df["Correlation_Id"].astype(int).apply(lambda x: int("0b" + (x * "1"), 2)) + ) + + # TODO: exec should be in hex and that will ease the comparison + assert (df["Exec_Mask"].astype(np.uint64) == df["Exec_Mask2"].astype(np.uint64)).all() + + +def exec_mask_manipulation_validate_csv(df, all_sampled=False): + assert not df.empty + + validate_instruction_comment(df) + validate_instruction_correlation_id_relation(df) + + # Validate samples with non-zero correlation IDs (and with decoded instructions) + samples_cid_non_zero_df = df[df["Correlation_Id"] != 0] + + # exactly 65 kernels and 65 correlation id + assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) >= 1).all() + assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) <= 65).all() + if all_sampled: + # all correlation IDs must be sampled + assert len(samples_cid_non_zero_df["Correlation_Id"].astype(int).unique()) == 65 + + first_64_kernels_df = samples_cid_non_zero_df[ + samples_cid_non_zero_df["Correlation_Id"] <= 64 + ] + + # Make a copy, so that we don't work (modify) a view. + validate_exec_mask_based_on_correlation_id(first_64_kernels_df.copy()) + + # validate the last kernel + kernel_65_df = df[df["Correlation_Id"] == 65] + + # assert that v_rcp instructions are properly decoded + # the v_rcp is executed by even SIMD threads + validate_instruction_decoding( + kernel_65_df, + "v_rcp_f64", + exec_mask_uint64=np.uint64(int("5555555555555555", 16)), + source_code_lines_range=(288, 387), + all_source_lines_samples=all_sampled, + ) + + # assert that v_fmac_f32 instructions are properly decoded + # the v_fmac_f32 is executed by odd SIMD threads + validate_instruction_decoding( + kernel_65_df, + "v_fmac_f32", + exec_mask_uint64=np.uint64(int("AAAAAAAAAAAAAAAA", 16)), + source_code_lines_range=(391, 490), + all_source_lines_samples=all_sampled, + ) + + +def test_validate_pc_sampling_exec_mask_manipulation_csv( + input_csv: pd.DataFrame, all_sampled: bool +): + exec_mask_manipulation_validate_csv(input_csv, all_sampled=all_sampled) + + +# ========================= Validating JSON output + + +def validate_json_exec_mask_manipulation(data_json, all_sampled=False): + # Although functional programming might look more elegant, + # I was trying to avoid multiple iteration over the list of samples. + # Thus, I decided to use procedural programming instead. + # Although, it would be more elegant to wrap some of the checks in dedicated functions, + # I noticed that it can introduce significant overhead, so I decided to inline those checks. + + # the function assume homogenous system + agents = data_json["agents"] + gpu_agents = list(filter(lambda agent: agent["type"] == 2, agents)) + # There should be at least one GPU agent + assert len(gpu_agents) > 0 + first_gpu_agent = gpu_agents[0] + num_xcc = first_gpu_agent["num_xcc"] + max_waves_per_simd = first_gpu_agent["max_waves_per_simd"] + simd_per_cu = first_gpu_agent["simd_per_cu"] + + instructions = data_json["strings"]["pc_sample_instructions"] + comments = data_json["strings"]["pc_sample_comments"] + + # execution mask where even SIMD lanes are active + # correspond to the v_rcp_f64 instructions of the last kernel + even_simds_active_exec_mask = np.uint64(int("5555555555555555", 16)) + # start and end source code lines of the v_rcp_f64 instructions of the last kernel + v_rcp_f64_start_line_num, v_rcp_f64_end_line_num = 288, 387 + # execution mask where even SIMD lanes are active + # correspond to the v_rcp_f64 instructions of the last kernel + odd_simds_active_exec_mask = np.uint64(int("AAAAAAAAAAAAAAAA", 16)) + # start and end source code lines of the v_fmac_f32 0 instructions of the last kernel + v_fmac_f32_start_line_num, v_fmac_f32_end_line_num = 391, 490 + + # sampled wave_ids of the last kernel + kernel65_sampled_wave_in_grp = set() + # sampled source lines of the last kernel matching v_rcp_f64 instructions + kernel65_v_rcp_64_sampled_source_line_set = set() + # sampled source lines of the last kernel matching v_rcp_f64 instructions + kernel65_v_fmac_f32_sampled_source_line_set = set() + # sampled correlation IDs + sampled_cids_set = set() + # pairs of sampled SIMD ids and waveslot IDs + sampled_simd_waveslots_pairs = set() + # sampled chiplets + sampled_chiplets = set() + # sample VMIDs + sampled_vmids = set() + + for sample in data_json["buffer_records"]["pc_sample_host_trap"]: + record = sample["record"] + cid = record["corr_id"]["internal"] + + # pull information from hw_id + hw_id = record["hw_id"] + sampled_chiplets.add(hw_id["chiplet"]) + sampled_simd_waveslots_pairs.add((hw_id["simd_id"], hw_id["wave_id"])) + sampled_vmids.add(hw_id["vm_id"]) + + # Checks specific for all samples + + # cids must be non-negative numbers + assert cid >= 0 + + inst_index = sample["inst_index"] + + # Since we're not enabling any kind of API tracing, the internal correlation id should + # be equal to the dispatch_id + assert cid == record["dispatch_id"] + + if cid == 0: + # Samples originates either from a blit kernel or self-modifying code. + # Thus, code object is uknown, as well as the instruction. + assert record["pc"]["code_object_id"] == 0 + assert inst_index == -1 + else: + # Update set of sampled cids + sampled_cids_set.add(cid) + + # All samples with non-zero correlation ID should pass the following checks + # code object is know, so as the instruction + assert record["pc"]["code_object_id"] != 0 + assert inst_index != -1 + + wgid = record["wrkgrp_id"] + # check corrdinates of the workgroup + assert wgid["x"] >= 0 and wgid["x"] <= 1023 + assert wgid["y"] == 0 + assert wgid["z"] == 0 + + wave_in_grp = record["wave_in_grp"] + exec_mask = record["exec_mask"] + + if cid < 65: + # checks specific for samples from first 64 kernels + assert wave_in_grp == 0 + # inline if possible + # validate_json_exec_mask_based_on_cid(sample.record) + + # The function assumes that each kernel launches 1024 blocks. + # Each block contains number of threads that matches correlation ID of the kernel. + # The exec mask of a sample should contain number of ones equal to + # the correlation ID of the kernel during which execution the sample was generated. + assert bin(exec_mask).count("1") == cid + + # TODO: Comment out the following code if it causes spurious fails. + # The more conservative constraint based on the experience follows. + # The exec mask of sampled instructions of the kernels respect the following pattern: + # cid -> exec + # 1 -> 0b1 + # 2 -> 0b11 + # 3 -> 0b111 + # ... + # 64 -> 0xffffffffffffffff + exec_mask_str = "0b" + "1" * cid + assert np.uint64(exec_mask) == np.uint64(int(exec_mask_str, 2)) + else: + # No more that 65 cids + assert cid == 65 + # Monitor wave_in_group being sampled + kernel65_sampled_wave_in_grp.add(wave_in_grp) + # chekcs specific for samples from the last kernel + assert wave_in_grp >= 0 and wave_in_grp <= 3 + + # validate instruction decoding + inst = instructions[inst_index] + comm = comments[inst_index] + # The instruction comment is isually in the following format: + # /path/to/source/file.cpp:line_num + line_num = int(comm.split(":")[-1]) + if inst.startswith("v_rcp_f64"): + # even SIMD lanes active + assert np.uint64(exec_mask) == even_simds_active_exec_mask + assert ( + line_num >= v_rcp_f64_start_line_num + and line_num <= v_rcp_f64_end_line_num + ) + kernel65_v_rcp_64_sampled_source_line_set.add(line_num) + elif inst.startswith("v_fmac_f32"): + # odd SIMD lanes active + assert np.uint64(exec_mask) == odd_simds_active_exec_mask + assert ( + line_num >= v_fmac_f32_start_line_num + and line_num <= v_fmac_f32_end_line_num + ) + kernel65_v_fmac_f32_sampled_source_line_set.add(line_num) + + if all_sampled: + # All cids that belongs to the range [1, 65] should be samples + assert len(sampled_cids_set) == 65 + + # all wave_ids that belongs to the range [0, 3] should be sampled for the last kernel + assert len(kernel65_sampled_wave_in_grp) == 4 + + # all source lines matches v_rcp_f64 instructions of the last kernel should be sampled + assert len(kernel65_v_rcp_64_sampled_source_line_set) == ( + v_rcp_f64_end_line_num - v_rcp_f64_start_line_num + 1 + ) + # all source lines matches v_fmac_f32 instructions of the last kernel should be sampled + assert len(kernel65_v_fmac_f32_sampled_source_line_set) == ( + v_fmac_f32_end_line_num - v_fmac_f32_start_line_num + 1 + ) + + # all chiplets must be sampled + assert len(sampled_chiplets) == num_xcc + # all (simd ID, waveslot ID) pairs must be samples + assert len(sampled_simd_waveslots_pairs) == simd_per_cu * max_waves_per_simd + + # assert chiplet index + assert all(map(lambda chiplet: 0 <= chiplet < num_xcc, sampled_chiplets)) + # assert (SIMD ID, waveslot ID) combinations + assert all( + map( + lambda simd_waveslot: (0 <= simd_waveslot[0] < simd_per_cu) + and (0 <= simd_waveslot[1] < max_waves_per_simd), + sampled_simd_waveslots_pairs, + ) + ) + # all samples should belong to the same VMID + assert len(sampled_vmids) == 1 + + +def test_validate_pc_sampling_exec_mask_manipulation_json( + input_json, input_csv: pd.DataFrame, all_sampled: bool +): + data = input_json["rocprofiler-sdk-tool"] + # The same amount of samples should be in both CSV and JSON files. + assert len(input_csv) == len(data["buffer_records"]["pc_sample_host_trap"]) + # # validating JSON output + validate_json_exec_mask_manipulation(data, all_sampled=all_sampled) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code)