rocprofv3: PC Sampling Support (#14)
* Adding tool pc sampling support
Fixing merge issue
tool support on SDKupdates
link amd-comgr
Sanitizer failure fix
fix format
Addressing review comments
misc fix
Adding dispatch id to the CSV output
AddingCHANGELOG
[ROCProfV3][PC Sampling] Initial ROCProfV3 PC sampling tests for JSON and CSV formats (#17)
ROCProfV3 initial tests for JSON and CSV output.
Simple kernels that simplify the verification of samples to instruction decoding
has been introduced.
removing option to enable pc sampling explicitly
Adding documentation
no pc-sampling option in tests anymore
Addressing review comments
Updating docs
an option for choosing whether all units must be sampled
try ignoring PC sampling tests (#36)
* run pc-sampling tests on MI2xx runners
* use v_fmac_f32 instead of s_nop 0 in tests
* fixing docs
[ROCm/rocprofiler-sdk commit: 50b185b9ac]
This commit is contained in:
کامیت شده توسط
GitHub
والد
5ec8560fab
کامیت
9cfbfb5060
@@ -109,6 +109,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
|
||||
- Check to force tools to initialize context id with zero
|
||||
- Support to specify hardware counters for collection using rocprofv3 as `rocprofv3 --pmc [COUNTER [COUNTER ...]]`
|
||||
- Memory Allocation Tracing
|
||||
- PC sampling tool support with CSV and JSON output formats
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import os
|
||||
import sys
|
||||
import argparse
|
||||
import subprocess
|
||||
import numpy
|
||||
|
||||
|
||||
class dotdict(dict):
|
||||
@@ -167,6 +168,30 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies, scratch, and allocations), and Kernel dispatches.",
|
||||
)
|
||||
|
||||
pc_sampling_options = parser.add_argument_group("PC sampling options")
|
||||
|
||||
pc_sampling_options.add_argument(
|
||||
"--pc-sampling-unit",
|
||||
help="",
|
||||
default=None,
|
||||
type=str.lower,
|
||||
choices=("instructions", "cycles", "time"),
|
||||
)
|
||||
|
||||
pc_sampling_options.add_argument(
|
||||
"--pc-sampling-method",
|
||||
help="",
|
||||
default=None,
|
||||
type=str.lower,
|
||||
choices=("stochastic", "host_trap"),
|
||||
)
|
||||
|
||||
pc_sampling_options.add_argument(
|
||||
"--pc-sampling-interval",
|
||||
help="",
|
||||
default=None,
|
||||
type=numpy.uint64,
|
||||
)
|
||||
basic_tracing_options = parser.add_argument_group("Basic tracing options")
|
||||
|
||||
# Add the arguments
|
||||
@@ -904,6 +929,18 @@ def run(app_args, args, **kwargs):
|
||||
if args.log_level in ("info", "trace", "env"):
|
||||
log_config(app_env)
|
||||
|
||||
if args.pc_sampling_unit or args.pc_sampling_method or args.pc_sampling_method:
|
||||
if not (
|
||||
args.pc_sampling_unit and args.pc_sampling_method and args.pc_sampling_method
|
||||
):
|
||||
fatal_error("All three PC sampling configurations need to be set")
|
||||
|
||||
update_env("ROCPROFILER_PC_SAMPLING_BETA_ENABLED", "ON")
|
||||
update_env("ROCPROF_PC_SAMPLING_UNIT", args.pc_sampling_unit)
|
||||
update_env("ROCPROF_PC_SAMPLING_METHOD", args.pc_sampling_method)
|
||||
update_env("ROCPROF_PC_SAMPLING_INTERVAL", args.pc_sampling_interval)
|
||||
update_env("ROCPROF_ENABLE_PC_SAMPLING", args.pc_sampling)
|
||||
|
||||
if use_execv:
|
||||
# does not return
|
||||
os.execvpe(app_args[0], app_args, env=app_env)
|
||||
|
||||
@@ -162,6 +162,18 @@ Here is the sample of commonly used ``rocprofv3`` command-line options. Some opt
|
||||
- Perfetto shared memory size hint in KB. default: 64 KB
|
||||
- Extension
|
||||
|
||||
* - ``--pc-sampling-method``
|
||||
- Type of PC Sampling, currently only host trap method is supported
|
||||
- PC Sampling Configurations
|
||||
|
||||
* - ``--pc-sampling-unit``
|
||||
- The unit appropriate to the PC sampling type/method, currently only time unit is supported
|
||||
- PC Sampling Configurations
|
||||
|
||||
* - ``--pc-sampling-interval``
|
||||
- Frequency at which PC samples are generated
|
||||
- PC Sampling Configurations
|
||||
|
||||
To see exhaustive list of ``rocprofv3`` options, run:
|
||||
|
||||
.. code-block:: bash
|
||||
@@ -675,6 +687,9 @@ Properties
|
||||
trace.
|
||||
- **``preload``** *(array)*: Libraries to prepend to LD_PRELOAD
|
||||
(usually for sanitizers).
|
||||
- **``pc_sampling_unit``** *(string)*: pc sampling unit.
|
||||
- **``pc_sampling_method``** *(string)*: pc sampling method.
|
||||
- **``pc_sampling_interval``** *(integer)*: pc sampling interval.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
@@ -1039,6 +1054,14 @@ Properties
|
||||
- **`id`** *(integer, required)*: Dimension ID.
|
||||
- **`instance_size`** *(integer, required)*: Size of the instance.
|
||||
- **`name`** *(string, required)*: Name of the dimension.
|
||||
- **``pc_sample_instructions``** *(array)*: Array of decoded
|
||||
instructions matching sampled PCs from pc_sample_host_trap
|
||||
section.
|
||||
- **``pc_sample_comments``** *(array)*: Comments matching
|
||||
assembly instructions from pc_sample_instructions array. If
|
||||
debug symbols are available, comments provide instructions
|
||||
to source-line mapping. Otherwise, a comment is an empty
|
||||
string.
|
||||
- **`code_objects`** *(array, required)*: Code object records.
|
||||
- **Items** *(object)*
|
||||
- **`size`** *(integer, required)*: Size of the code object.
|
||||
@@ -1103,6 +1126,37 @@ Properties
|
||||
- **`arch_vgpr_count`** *(integer, required)*: Count of VGPRs.
|
||||
- **`sgpr_count`** *(integer, required)*: Count of SGPRs.
|
||||
- **`lds_block_size_v`** *(integer, required)*: Size of LDS block.
|
||||
- **``pc_sample_host_trap``** *(array)*: Host Trap PC Sampling records.
|
||||
- **Items** *(object)*
|
||||
- **``hw_id``** *(object)*: Describes hardware part on which sampled wave was running.
|
||||
- **``chiplet``** *(integer)*: Chiplet index.
|
||||
- **``wave_id``** *(integer)*: Wave slot index.
|
||||
- **``simd_id``** *(integer)*: SIMD index.
|
||||
- **``pipe_id``** *(integer)*: Pipe index.
|
||||
- **``cu_or_wgp_id``** *(integer)*: Index of compute unit or workgroup processer.
|
||||
- **``shader_array_id``** *(integer)*: Shader array index.
|
||||
- **``shader_engine_id``** *(integer)*: Shader engine
|
||||
index.
|
||||
- **``workgroup_id``** *(integer)*: Workgroup position in the 3D.
|
||||
- **``vm_id``** *(integer)*: Virtual memory ID.
|
||||
- **``queue_id``** *(integer)*: Queue id.
|
||||
- **``microengine_id``** *(integer)*: ACE
|
||||
(microengine) index.
|
||||
- **``pc``** *(object)*: Encapsulates information about
|
||||
sampled PC.
|
||||
- **``code_object_id``** *(integer)*: Code object id.
|
||||
- **``code_object_offset``** *(integer)*: Offset within the object if the latter is known. Otherwise, virtual address of the PC.
|
||||
- **``exec_mask``** *(integer)*: Execution mask indicating active SIMD lanes of sampled wave.
|
||||
- **``timestamp``** *(integer)*: Timestamp.
|
||||
- **``dispatch_id``** *(integer)*: Dispatch id.
|
||||
- **``correlation_id``** *(object)*: Correlation ID information.
|
||||
- **``internal``** *(integer)*: Internal correlation ID.
|
||||
- **``external``** *(integer)*: External correlation ID.
|
||||
- **``rocprofiler_dim3_t``** *(object)*: Position of the workgroup in 3D grid.
|
||||
- **``x``** *(integer)*: Dimension x.
|
||||
- **``y``** *(integer)*: Dimension y.
|
||||
- **``z``** *(integer)*: Dimension z.
|
||||
- **``wave_in_group``** *(integer)*: Wave position within the workgroup (0-31).
|
||||
- **`buffer_records`** *(object, required)*: Buffer record details.
|
||||
- **`kernel_dispatch`** *(array)*: Kernel dispatch records.
|
||||
- **Items** *(object)*
|
||||
|
||||
@@ -559,6 +559,14 @@
|
||||
"required": [
|
||||
"dimension_ids"
|
||||
]
|
||||
},
|
||||
"pc_sample_instructions": {
|
||||
"type": "array",
|
||||
"description": "Array of decoded instructions matching sampled PCs from pc_sample_host_trap section."
|
||||
},
|
||||
"pc_sample_comments": {
|
||||
"type": "array",
|
||||
"description": "Comments matching assembly instructions from pc_sample_instructions array. If debug symbols are available, comments provide instructions to source-line mapping. Otherwise, a comment is an empty string."
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -925,7 +933,129 @@
|
||||
"lds_block_size_v"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"pc_sample_host_trap": {
|
||||
"type": "array",
|
||||
"description": "Host Trap PC Sampling records.",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hw_id": {
|
||||
"type": "object",
|
||||
"description" : "Describes hardware part on which sampled wave was running.",
|
||||
"properties": {
|
||||
"chiplet":{
|
||||
"type": "integer",
|
||||
"description": "Chiplet index."
|
||||
},
|
||||
"wave_id ":{
|
||||
"type": "integer",
|
||||
"description": "Wave slot index."
|
||||
},
|
||||
"simd_id":{
|
||||
"type": "integer",
|
||||
"description": "SIMD index."
|
||||
},
|
||||
"pipe_id ":{
|
||||
"type": "integer",
|
||||
"description": "Pipe index."
|
||||
},
|
||||
"cu_or_wgp_id":{
|
||||
"type": "integer",
|
||||
"description": "Index of compute unit or workgroup processer."
|
||||
},
|
||||
"shader_array_id":{
|
||||
"type": "integer",
|
||||
"description": "Shader array index."
|
||||
},
|
||||
"shader_engine_id":{
|
||||
"type": "integer",
|
||||
"description": "Shader engine index."
|
||||
},
|
||||
"workgroup_id":{
|
||||
"type": "integer",
|
||||
"description": "Workgroup position in the 3D."
|
||||
},
|
||||
"vm_id":{
|
||||
"type": "integer",
|
||||
"description": "Virtual memory ID."
|
||||
},
|
||||
"queue_id":{
|
||||
"type": "integer",
|
||||
"description": "Queue id."
|
||||
},
|
||||
"microengine_id":{
|
||||
"type": "integer",
|
||||
"description": "ACE (microengine) index."
|
||||
}
|
||||
}
|
||||
},
|
||||
"pc": {
|
||||
"type": "object",
|
||||
"description": "Encapsulates information about sampled PC.",
|
||||
"properties": {
|
||||
"code_object_id":{
|
||||
"type": "integer",
|
||||
"description": "Code object id"
|
||||
},
|
||||
"code_object_offset":{
|
||||
"type": "integer",
|
||||
"description": "Offset within the object if the latter is known. Otherwise, virtual address of the PC."
|
||||
}
|
||||
}
|
||||
},
|
||||
"exec_mask":{
|
||||
"type": "integer",
|
||||
"description": "Execution mask indicating active SIMD lanes of sampled wave."
|
||||
},
|
||||
"timestamp":{
|
||||
"type": "integer",
|
||||
"description": "Timestamp."
|
||||
},
|
||||
"dispatch_id":{
|
||||
"type": "integer",
|
||||
"description": "Dispatch id."
|
||||
},
|
||||
"correlation_id": {
|
||||
"type": "object",
|
||||
"description": "Correlation ID information.",
|
||||
"properties": {
|
||||
"internal": {
|
||||
"type": "integer",
|
||||
"description": "Internal correlation ID."
|
||||
},
|
||||
"external": {
|
||||
"type": "integer",
|
||||
"description": "External correlation ID."
|
||||
}
|
||||
}
|
||||
},
|
||||
"rocprofiler_dim3_t": {
|
||||
"type": "object",
|
||||
"description": " Position of the workgroup in 3D grid.",
|
||||
"properties": {
|
||||
"x": {
|
||||
"type": "integer",
|
||||
"description": "Dimension x."
|
||||
},
|
||||
"y": {
|
||||
"type": "integer",
|
||||
"description": "Dimension y."
|
||||
},
|
||||
"z": {
|
||||
"type": "integer",
|
||||
"description": "Dimension z."
|
||||
}
|
||||
}
|
||||
},
|
||||
"wave_in_group": {
|
||||
"type": "integer",
|
||||
"description": "Wave position within the workgroup (0-31)."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"buffer_records": {
|
||||
|
||||
@@ -144,7 +144,19 @@
|
||||
"preload":{
|
||||
"type": "array",
|
||||
"description": "Libraries to prepend to LD_PRELOAD (usually for sanitizers)"
|
||||
}
|
||||
},
|
||||
"pc_sampling_unit": {
|
||||
"type": "string",
|
||||
"description": "pc sampling unit"
|
||||
},
|
||||
"pc_sampling_method": {
|
||||
"type": "string",
|
||||
"description": "pc sampling method"
|
||||
},
|
||||
"pc_sampling_interval": {
|
||||
"type": "integer",
|
||||
"description": "pc sampling interval"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -735,6 +735,44 @@ save(ArchiveT& ar, rocprofiler_agent_cache_t data)
|
||||
ROCP_SDK_SAVE_DATA_FIELD(latency);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(type);
|
||||
}
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_pc_t data)
|
||||
{
|
||||
ROCP_SDK_SAVE_DATA_FIELD(code_object_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(code_object_offset);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_pc_sampling_hw_id_v0_t data)
|
||||
{
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("chiplet", chiplet);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("wave_id", wave_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("simd_id", simd_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("pipe_id", pipe_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("cu_or_wgp_id", cu_or_wgp_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("shader_array_id", shader_array_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("shader_engine_id", shader_engine_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("workgroup_id ", workgroup_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("vm_id", vm_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("queue_id", queue_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("microengine_id", microengine_id);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_pc_sampling_record_host_trap_v0_t data)
|
||||
{
|
||||
ROCP_SDK_SAVE_DATA_FIELD(hw_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(pc);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(exec_mask);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(dispatch_id);
|
||||
ROCP_SDK_SAVE_DATA_VALUE("corr_id", correlation_id);
|
||||
ROCP_SDK_SAVE_DATA_VALUE("wrkgrp_id", workgroup_id);
|
||||
ROCP_SDK_SAVE_DATA_BITFIELD("wave_in_grp", wave_in_group);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
#include "counter_info.hpp"
|
||||
#include "generator.hpp"
|
||||
#include "pc_sample_transform.hpp"
|
||||
#include "statistics.hpp"
|
||||
#include "tmp_file_buffer.hpp"
|
||||
|
||||
@@ -159,5 +160,8 @@ using memory_allocation_buffered_output_t =
|
||||
using counter_records_buffered_output_t =
|
||||
::rocprofiler::tool::buffered_output<rocprofiler::tool::serialized_counter_record_t,
|
||||
domain_type::COUNTER_VALUES>;
|
||||
using pc_sampling_host_trap_buffered_output_t =
|
||||
buffered_output<rocprofiler::tool::rocprofiler_tool_pc_sampling_host_trap_record_t,
|
||||
domain_type::PC_SAMPLING_HOST_TRAP>;
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -99,17 +99,18 @@ struct csv_encoder
|
||||
}
|
||||
};
|
||||
|
||||
using api_csv_encoder = csv_encoder<7>;
|
||||
using agent_info_csv_encoder = csv_encoder<53>;
|
||||
using kernel_trace_csv_encoder = csv_encoder<18>;
|
||||
using counter_collection_csv_encoder = csv_encoder<18>;
|
||||
using memory_copy_csv_encoder = csv_encoder<7>;
|
||||
using memory_allocation_csv_encoder = csv_encoder<8>;
|
||||
using marker_csv_encoder = csv_encoder<7>;
|
||||
using list_basic_metrics_csv_encoder = csv_encoder<5>;
|
||||
using list_derived_metrics_csv_encoder = csv_encoder<5>;
|
||||
using scratch_memory_encoder = csv_encoder<8>;
|
||||
using stats_csv_encoder = csv_encoder<8>;
|
||||
using api_csv_encoder = csv_encoder<7>;
|
||||
using agent_info_csv_encoder = csv_encoder<53>;
|
||||
using kernel_trace_csv_encoder = csv_encoder<18>;
|
||||
using counter_collection_csv_encoder = csv_encoder<18>;
|
||||
using memory_copy_csv_encoder = csv_encoder<7>;
|
||||
using memory_allocation_csv_encoder = csv_encoder<8>;
|
||||
using marker_csv_encoder = csv_encoder<7>;
|
||||
using list_basic_metrics_csv_encoder = csv_encoder<5>;
|
||||
using list_derived_metrics_csv_encoder = csv_encoder<5>;
|
||||
using scratch_memory_encoder = csv_encoder<8>;
|
||||
using stats_csv_encoder = csv_encoder<8>;
|
||||
using pc_sampling_host_trap_csv_encoder = csv_encoder<6>;
|
||||
} // namespace csv
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
Executable → Regular
@@ -57,6 +57,10 @@ DEFINE_BUFFER_TYPE_NAME(MEMORY_ALLOCATION,
|
||||
"memory_allocation",
|
||||
"memory_allocation_stats")
|
||||
DEFINE_BUFFER_TYPE_NAME(COUNTER_VALUES, "COUNTER_VALUES", "counter_values", "no_filename")
|
||||
DEFINE_BUFFER_TYPE_NAME(PC_SAMPLING_HOST_TRAP,
|
||||
"PC_SAMPLING_HOST_TRAP",
|
||||
"pc_sampling_host_trap",
|
||||
"pc_sampling_host_trap_stats")
|
||||
|
||||
#undef DEFINE_BUFFER_TYPE_NAME
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ enum class domain_type
|
||||
RCCL,
|
||||
MEMORY_ALLOCATION,
|
||||
COUNTER_VALUES,
|
||||
PC_SAMPLING_HOST_TRAP,
|
||||
LAST,
|
||||
};
|
||||
|
||||
|
||||
Executable → Regular
+59
@@ -717,6 +717,65 @@ generate_csv(const output_config& cfg,
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
generate_csv(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_tool_pc_sampling_host_trap_record_t>& data,
|
||||
const stats_entry_t& stats)
|
||||
{
|
||||
if(data.empty()) return;
|
||||
|
||||
if(cfg.stats && stats)
|
||||
write_stats(get_stats_output_file(cfg, domain_type::PC_SAMPLING_HOST_TRAP), stats.entries);
|
||||
|
||||
auto ofs = tool::csv_output_file{cfg,
|
||||
domain_type::PC_SAMPLING_HOST_TRAP,
|
||||
tool::csv::pc_sampling_host_trap_csv_encoder{},
|
||||
{"Sample_Timestamp",
|
||||
"Exec_Mask",
|
||||
"Dispatch_Id",
|
||||
"Instruction",
|
||||
"Instruction_Comment",
|
||||
"Correlation_Id"}};
|
||||
for(auto ditr : data)
|
||||
{
|
||||
for(const auto& record : data.get(ditr))
|
||||
{
|
||||
if(record.inst_index == -1)
|
||||
{
|
||||
auto row_ss = std::stringstream{};
|
||||
std::string inst_comment =
|
||||
"Unrecognized code object id, physical virtual address of PC:" +
|
||||
std::to_string(record.pc_sample_record.pc.code_object_offset);
|
||||
rocprofiler::tool::csv::pc_sampling_host_trap_csv_encoder::write_row(
|
||||
row_ss,
|
||||
record.pc_sample_record.timestamp,
|
||||
record.pc_sample_record.exec_mask,
|
||||
record.pc_sample_record.dispatch_id,
|
||||
"",
|
||||
inst_comment,
|
||||
record.pc_sample_record.correlation_id.internal);
|
||||
|
||||
ofs << row_ss.str();
|
||||
}
|
||||
else
|
||||
{
|
||||
auto row_ss = std::stringstream{};
|
||||
rocprofiler::tool::csv::pc_sampling_host_trap_csv_encoder::write_row(
|
||||
row_ss,
|
||||
record.pc_sample_record.timestamp,
|
||||
record.pc_sample_record.exec_mask,
|
||||
record.pc_sample_record.dispatch_id,
|
||||
tool_metadata.get_instruction(record.inst_index),
|
||||
tool_metadata.get_comment(record.inst_index),
|
||||
record.pc_sample_record.correlation_id.internal);
|
||||
|
||||
ofs << row_ss.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
generate_csv(const output_config& cfg,
|
||||
const metadata& /*tool_metadata*/,
|
||||
|
||||
@@ -92,6 +92,11 @@ generate_csv(const output_config&
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& data,
|
||||
const stats_entry_t& stats);
|
||||
void
|
||||
generate_csv(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_tool_pc_sampling_host_trap_record_t>& data,
|
||||
const stats_entry_t& stats);
|
||||
|
||||
void
|
||||
generate_csv(const output_config& cfg,
|
||||
|
||||
@@ -124,7 +124,13 @@ write_json(json_output& json_ar,
|
||||
json_ar.startNode();
|
||||
json_ar(cereal::make_nvp("callback_records", callback_name_info));
|
||||
json_ar(cereal::make_nvp("buffer_records", buffer_name_info));
|
||||
json_ar(
|
||||
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
|
||||
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
|
||||
json_ar(cereal::make_nvp("marker_api", marker_msg_data));
|
||||
json_ar(
|
||||
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
|
||||
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
|
||||
|
||||
{
|
||||
auto _extern_corr_id_strings = std::map<size_t, std::string>{};
|
||||
@@ -178,7 +184,8 @@ write_json(json_output& json_ar,
|
||||
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
|
||||
generator<rocprofiler_buffer_tracing_rccl_api_record_t> rccl_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_allocation_record_t> memory_allocation_gen)
|
||||
generator<rocprofiler_buffer_tracing_memory_allocation_record_t> memory_allocation_gen,
|
||||
generator<rocprofiler_tool_pc_sampling_host_trap_record_t> pc_sampling_gen)
|
||||
|
||||
{
|
||||
// summary
|
||||
@@ -219,6 +226,7 @@ write_json(json_output& json_ar,
|
||||
json_ar(cereal::make_nvp("memory_copy", memory_copy_gen));
|
||||
json_ar(cereal::make_nvp("memory_allocation", memory_allocation_gen));
|
||||
json_ar(cereal::make_nvp("scratch_memory", scratch_memory_gen));
|
||||
json_ar(cereal::make_nvp("pc_sample_host_trap", pc_sampling_gen));
|
||||
json_ar.finishNode();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,6 +93,7 @@ write_json(json_output& json
|
||||
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
|
||||
generator<rocprofiler_buffer_tracing_rccl_api_record_t> rccl_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_allocation_record_t> memory_allocation_gen);
|
||||
generator<rocprofiler_buffer_tracing_memory_allocation_record_t> memory_allocation_gen,
|
||||
generator<rocprofiler_tool_pc_sampling_host_trap_record_t> pc_sampling_gen);
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -411,5 +411,13 @@ generate_stats(const output_config& cfg,
|
||||
|
||||
if(cfg.stats_summary) generate_stats(cfg, _os, "SUMMARY", data_v, _indent);
|
||||
}
|
||||
|
||||
stats_entry_t
|
||||
generate_stats(const output_config& /* cfg*/,
|
||||
const metadata& /*tool_metadata*/,
|
||||
const generator<rocprofiler_tool_pc_sampling_host_trap_record_t>& /*data*/)
|
||||
{
|
||||
return stats_entry_t{};
|
||||
}
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -75,6 +75,10 @@ generate_stats(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& data);
|
||||
|
||||
stats_entry_t
|
||||
generate_stats(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_tool_pc_sampling_host_trap_record_t>& data);
|
||||
void
|
||||
generate_stats(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
|
||||
@@ -47,6 +47,20 @@ dimensions_info_callback(rocprofiler_counter_id_t /*id*/,
|
||||
dimensions_info->emplace_back(dim_info[j]);
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
rocprofiler_status_t
|
||||
query_pc_sampling_configuration(const rocprofiler_pc_sampling_configuration_t* configs,
|
||||
long unsigned int num_config,
|
||||
void* user_data)
|
||||
{
|
||||
auto* avail_configs =
|
||||
static_cast<std::vector<rocprofiler_pc_sampling_configuration_t>*>(user_data);
|
||||
for(size_t i = 0; i < num_config; i++)
|
||||
{
|
||||
avail_configs->emplace_back(configs[i]);
|
||||
}
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
kernel_symbol_info::kernel_symbol_info()
|
||||
@@ -78,7 +92,14 @@ metadata::metadata(inprocess)
|
||||
_gpu_agents.reserve(agents.size());
|
||||
for(auto& itr : agents)
|
||||
{
|
||||
if(itr.type == ROCPROFILER_AGENT_TYPE_GPU) _gpu_agents.emplace_back(&itr);
|
||||
if(itr.type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
{
|
||||
_gpu_agents.emplace_back(&itr);
|
||||
auto pc_configs = std::vector<rocprofiler_pc_sampling_configuration_t>{};
|
||||
rocprofiler_query_pc_sampling_agent_configurations(
|
||||
itr.id, query_pc_sampling_configuration, &pc_configs);
|
||||
agent_pc_sample_config_info.emplace(itr.id, pc_configs);
|
||||
}
|
||||
}
|
||||
|
||||
// make sure they are sorted by node id
|
||||
@@ -112,6 +133,7 @@ void metadata::init(inprocess)
|
||||
void* user_data) {
|
||||
auto* data_v = static_cast<agent_counter_info_map_t*>(user_data);
|
||||
data_v->emplace(id, counter_info_vec_t{});
|
||||
|
||||
for(size_t i = 0; i < num_counters; ++i)
|
||||
{
|
||||
auto _info = rocprofiler_counter_info_v0_t{};
|
||||
@@ -260,6 +282,17 @@ metadata::get_gpu_agents() const
|
||||
return _data;
|
||||
}
|
||||
|
||||
pc_sample_config_vec_t
|
||||
metadata::get_pc_sample_config_info(rocprofiler_agent_id_t _val) const
|
||||
{
|
||||
auto _ret = pc_sample_config_vec_t{};
|
||||
auto pc_sample_config = agent_pc_sample_config_info.at(_val);
|
||||
for(const auto& itr : pc_sample_config)
|
||||
_ret.emplace_back(itr);
|
||||
|
||||
return _ret;
|
||||
}
|
||||
|
||||
counter_info_vec_t
|
||||
metadata::get_counter_info() const
|
||||
{
|
||||
@@ -417,5 +450,62 @@ metadata::get_string_entry(size_t key) const
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int64_t
|
||||
metadata::get_instruction_index(rocprofiler_pc_t record)
|
||||
{
|
||||
inst_t ins;
|
||||
ins.code_object_id = record.code_object_id;
|
||||
ins.code_object_offset = record.code_object_offset;
|
||||
auto itr = indexes.find(ins);
|
||||
if(itr != indexes.end()) return itr->second;
|
||||
auto idx = instruction_decoder.size();
|
||||
auto pc_instruction = decode_instruction(record);
|
||||
instruction_decoder.emplace_back(pc_instruction->inst);
|
||||
instruction_comment.emplace_back(pc_instruction->comment);
|
||||
indexes.emplace(ins, idx);
|
||||
return idx;
|
||||
}
|
||||
|
||||
void
|
||||
metadata::add_decoder(rocprofiler_code_object_info_t* obj_data)
|
||||
{
|
||||
if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE)
|
||||
{
|
||||
decoder.wlock(
|
||||
[](auto& _decoder, rocprofiler_code_object_info_t* obj_data_v) {
|
||||
_decoder.addDecoder(obj_data_v->uri,
|
||||
obj_data_v->code_object_id,
|
||||
obj_data_v->load_delta,
|
||||
obj_data_v->load_size);
|
||||
},
|
||||
obj_data);
|
||||
}
|
||||
else
|
||||
{
|
||||
decoder.wlock(
|
||||
[](auto& _decoder, rocprofiler_code_object_info_t* obj_data_v) {
|
||||
_decoder.addDecoder(
|
||||
// NOLINTBEGIN(performance-no-int-to-ptr)
|
||||
reinterpret_cast<const void*>(obj_data_v->memory_base),
|
||||
// NOLINTEND(performance-no-int-to-ptr)
|
||||
obj_data_v->memory_size,
|
||||
obj_data_v->code_object_id,
|
||||
obj_data_v->load_delta,
|
||||
obj_data_v->load_size);
|
||||
},
|
||||
obj_data);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<instruction_t>
|
||||
metadata::decode_instruction(rocprofiler_pc_t pc)
|
||||
{
|
||||
return decoder.wlock(
|
||||
[](auto& _decoder, uint64_t id, uint64_t addr) { return _decoder.get(id, addr); },
|
||||
pc.code_object_id,
|
||||
pc.code_object_offset);
|
||||
}
|
||||
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "agent_info.hpp"
|
||||
#include "counter_info.hpp"
|
||||
#include "kernel_symbol_info.hpp"
|
||||
#include "pc_sample_transform.hpp"
|
||||
|
||||
#include "lib/common/container/small_vector.hpp"
|
||||
#include "lib/common/demangle.hpp"
|
||||
@@ -36,6 +37,7 @@
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
#include <rocprofiler-sdk/cxx/codeobj/code_printing.hpp>
|
||||
#include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
#include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
#include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
@@ -71,7 +73,8 @@ using marker_message_ordered_map_t = std::map<uint64_t, std::string>;
|
||||
using string_entry_map_t = std::unordered_map<size_t, std::unique_ptr<std::string>>;
|
||||
using counter_dimension_vec_t = std::vector<rocprofiler_record_dimension_info_t>;
|
||||
using external_corr_id_set_t = std::unordered_set<uint64_t>;
|
||||
|
||||
using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
|
||||
using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction;
|
||||
template <typename Tp>
|
||||
using synced_map = common::Synchronized<Tp, true>;
|
||||
|
||||
@@ -82,19 +85,21 @@ struct metadata
|
||||
struct inprocess
|
||||
{};
|
||||
|
||||
pid_t process_id = 0;
|
||||
uint64_t process_start_ns = 0;
|
||||
uint64_t process_end_ns = 0;
|
||||
agent_info_vec_t agents = {};
|
||||
agent_info_map_t agents_map = {};
|
||||
agent_counter_info_map_t agent_counter_info = {};
|
||||
sdk::buffer_name_info buffer_names = {};
|
||||
sdk::callback_name_info callback_names = {};
|
||||
synced_map<code_object_data_map_t> code_objects = {};
|
||||
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
|
||||
synced_map<marker_message_map_t> marker_messages = {};
|
||||
synced_map<string_entry_map_t> string_entries = {};
|
||||
synced_map<external_corr_id_set_t> external_corr_ids = {};
|
||||
pid_t process_id = 0;
|
||||
uint64_t process_start_ns = 0;
|
||||
uint64_t process_end_ns = 0;
|
||||
agent_info_vec_t agents = {};
|
||||
agent_info_map_t agents_map = {};
|
||||
agent_counter_info_map_t agent_counter_info = {};
|
||||
agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {};
|
||||
|
||||
sdk::buffer_name_info buffer_names = {};
|
||||
sdk::callback_name_info callback_names = {};
|
||||
synced_map<code_object_data_map_t> code_objects = {};
|
||||
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
|
||||
synced_map<marker_message_map_t> marker_messages = {};
|
||||
synced_map<string_entry_map_t> string_entries = {};
|
||||
synced_map<external_corr_id_set_t> external_corr_ids = {};
|
||||
|
||||
metadata() = default;
|
||||
metadata(inprocess);
|
||||
@@ -119,6 +124,13 @@ struct metadata
|
||||
agent_info_ptr_vec_t get_gpu_agents() const;
|
||||
counter_info_vec_t get_counter_info() const;
|
||||
counter_dimension_vec_t get_counter_dimension_info() const;
|
||||
pc_sample_config_vec_t get_pc_sample_config_info(rocprofiler_agent_id_t _val) const;
|
||||
std::vector<std::string> get_pc_sample_instructions() const { return instruction_decoder; }
|
||||
std::vector<std::string> get_pc_sample_comments() const { return instruction_comment; }
|
||||
std::string_view get_instruction(int64_t index) const { return instruction_decoder.at(index); }
|
||||
std::string_view get_comment(int64_t index) const { return instruction_comment.at(index); }
|
||||
int64_t get_instruction_index(rocprofiler_pc_t record);
|
||||
void add_decoder(rocprofiler_code_object_info_t* obj_data_v);
|
||||
|
||||
template <typename Tp>
|
||||
Tp get_marker_messages(Tp&&);
|
||||
@@ -141,7 +153,13 @@ struct metadata
|
||||
const std::string* get_string_entry(size_t key) const;
|
||||
|
||||
private:
|
||||
bool inprocess_init = false;
|
||||
bool inprocess_init = false;
|
||||
std::unique_ptr<instruction_t> decode_instruction(rocprofiler_pc_t pc);
|
||||
synced_map<code_obj_decoder_t> decoder = {};
|
||||
// TODO: We may have to reserve the vector size based on map size
|
||||
std::vector<std::string> instruction_decoder = {};
|
||||
std::vector<std::string> instruction_comment = {};
|
||||
std::map<inst_t, size_t> indexes = {};
|
||||
};
|
||||
|
||||
template <typename Tp>
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/cxx/codeobj/code_printing.hpp>
|
||||
#include <rocprofiler-sdk/cxx/serialization.hpp>
|
||||
|
||||
#include "lib/common/static_object.hpp"
|
||||
#include "lib/common/synchronized.hpp"
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
using pc_sample_config_vec_t = std::vector<rocprofiler_pc_sampling_configuration_t>;
|
||||
using agent_pc_sample_config_info_map_t =
|
||||
std::unordered_map<rocprofiler_agent_id_t, pc_sample_config_vec_t>;
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace tool
|
||||
{
|
||||
struct inst_t
|
||||
{
|
||||
uint64_t code_object_id;
|
||||
uint64_t code_object_offset;
|
||||
|
||||
bool operator==(const inst_t& inst) const
|
||||
{
|
||||
return this->code_object_id == inst.code_object_id &&
|
||||
this->code_object_offset == inst.code_object_offset;
|
||||
}
|
||||
|
||||
bool operator<(const inst_t& b) const
|
||||
{
|
||||
if(this->code_object_id == b.code_object_id)
|
||||
return this->code_object_offset < b.code_object_offset;
|
||||
return this->code_object_id < b.code_object_id;
|
||||
};
|
||||
};
|
||||
|
||||
// TODO:: Check if we can template this structure
|
||||
struct rocprofiler_tool_pc_sampling_host_trap_record_t
|
||||
{
|
||||
rocprofiler_pc_sampling_record_host_trap_v0_t pc_sample_record;
|
||||
int64_t inst_index;
|
||||
|
||||
rocprofiler_tool_pc_sampling_host_trap_record_t(
|
||||
rocprofiler_pc_sampling_record_host_trap_v0_t record,
|
||||
int64_t index)
|
||||
: pc_sample_record(record)
|
||||
, inst_index(index)
|
||||
{}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void save(ArchiveT& ar) const
|
||||
{
|
||||
ar(cereal::make_nvp("record", pc_sample_record));
|
||||
ar(cereal::make_nvp("inst_index", inst_index));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
@@ -21,7 +21,9 @@ target_link_libraries(
|
||||
rocprofiler-sdk::rocprofiler-sdk-output-library
|
||||
rocprofiler-sdk::rocprofiler-sdk-cereal
|
||||
rocprofiler-sdk::rocprofiler-sdk-perfetto
|
||||
rocprofiler-sdk::rocprofiler-sdk-otf2)
|
||||
rocprofiler-sdk::rocprofiler-sdk-otf2
|
||||
rocprofiler-sdk::rocprofiler-sdk-dw
|
||||
rocprofiler-sdk::rocprofiler-sdk-amd-comgr)
|
||||
|
||||
set_target_properties(
|
||||
rocprofiler-sdk-tool
|
||||
|
||||
@@ -195,6 +195,22 @@ config::config()
|
||||
, counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))}
|
||||
{
|
||||
if(kernel_filter_include.empty()) kernel_filter_include = std::string{".*"};
|
||||
|
||||
std::unordered_map<std::string_view, rocprofiler_pc_sampling_unit_t> pc_sampling_unit_map = {
|
||||
{"none", ROCPROFILER_PC_SAMPLING_UNIT_NONE},
|
||||
{"instructions", ROCPROFILER_PC_SAMPLING_UNIT_INSTRUCTIONS},
|
||||
{"cycles", ROCPROFILER_PC_SAMPLING_UNIT_CYCLES},
|
||||
{"time", ROCPROFILER_PC_SAMPLING_UNIT_TIME}};
|
||||
|
||||
std::unordered_map<std::string_view, rocprofiler_pc_sampling_method_t> pc_sampling_method_map =
|
||||
{{"none", ROCPROFILER_PC_SAMPLING_METHOD_NONE},
|
||||
{"stochastic", ROCPROFILER_PC_SAMPLING_METHOD_STOCHASTIC},
|
||||
{"host_trap", ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP}};
|
||||
|
||||
pc_sampling_method_value = pc_sampling_method_map.at(pc_sampling_method);
|
||||
if(pc_sampling_method_value == ROCPROFILER_PC_SAMPLING_METHOD_HOST_TRAP)
|
||||
pc_sampling_host_trap = true;
|
||||
pc_sampling_unit_value = pc_sampling_unit_map.at(pc_sampling_unit);
|
||||
}
|
||||
|
||||
std::string
|
||||
|
||||
@@ -75,29 +75,36 @@ struct config : output_config
|
||||
config& operator=(const config&) = default;
|
||||
config& operator=(config&&) noexcept = default;
|
||||
|
||||
bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true);
|
||||
bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false);
|
||||
bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false);
|
||||
bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false);
|
||||
bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false);
|
||||
bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false);
|
||||
bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false);
|
||||
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
|
||||
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
|
||||
bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false);
|
||||
bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false);
|
||||
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
|
||||
bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false);
|
||||
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
|
||||
bool rccl_api_trace = get_env("ROCPROF_RCCL_API_TRACE", false);
|
||||
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
|
||||
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
|
||||
bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true);
|
||||
bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false);
|
||||
bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false);
|
||||
bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false);
|
||||
bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false);
|
||||
bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false);
|
||||
bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false);
|
||||
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
|
||||
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
|
||||
bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false);
|
||||
bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false);
|
||||
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
|
||||
bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false);
|
||||
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
|
||||
bool rccl_api_trace = get_env("ROCPROF_RCCL_API_TRACE", false);
|
||||
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
|
||||
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
|
||||
bool pc_sampling_host_trap = false;
|
||||
size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1);
|
||||
rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE;
|
||||
rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE;
|
||||
|
||||
int mpi_size = get_mpi_size();
|
||||
int mpi_rank = get_mpi_rank();
|
||||
std::string stats_summary_unit = get_env("ROCPROF_STATS_SUMMARY_UNITS", "nsec");
|
||||
int mpi_size = get_mpi_size();
|
||||
int mpi_rank = get_mpi_rank();
|
||||
|
||||
std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*");
|
||||
std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", "");
|
||||
std::string pc_sampling_method = get_env("ROCPROF_PC_SAMPLING_METHOD", "none");
|
||||
std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none");
|
||||
std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", "");
|
||||
|
||||
std::unordered_set<uint32_t> kernel_filter_range = {};
|
||||
|
||||
Executable → Regular
+124
-4
@@ -157,17 +157,19 @@ struct buffer_ids
|
||||
rocprofiler_buffer_id_t counter_collection = {};
|
||||
rocprofiler_buffer_id_t scratch_memory = {};
|
||||
rocprofiler_buffer_id_t rccl_api_trace = {};
|
||||
rocprofiler_buffer_id_t pc_sampling_host_trap = {};
|
||||
|
||||
auto as_array() const
|
||||
{
|
||||
return std::array<rocprofiler_buffer_id_t, 8>{hsa_api_trace,
|
||||
return std::array<rocprofiler_buffer_id_t, 9>{hsa_api_trace,
|
||||
hip_api_trace,
|
||||
kernel_trace,
|
||||
memory_copy_trace,
|
||||
memory_allocation_trace,
|
||||
counter_collection,
|
||||
scratch_memory,
|
||||
rccl_api_trace};
|
||||
rccl_api_trace,
|
||||
pc_sampling_host_trap};
|
||||
}
|
||||
};
|
||||
|
||||
@@ -547,6 +549,10 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
auto* obj_data = static_cast<tool::rocprofiler_code_object_info_t*>(record.payload);
|
||||
|
||||
CHECK_NOTNULL(tool_metadata)->add_code_object(*obj_data);
|
||||
if(tool::get_config().pc_sampling_host_trap)
|
||||
{
|
||||
CHECK_NOTNULL(tool_metadata)->add_decoder(obj_data);
|
||||
}
|
||||
}
|
||||
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
|
||||
{
|
||||
@@ -801,6 +807,58 @@ get_device_counting_service(rocprofiler_agent_id_t agent_id)
|
||||
return profile;
|
||||
}
|
||||
|
||||
int64_t
|
||||
get_instruction_index(rocprofiler_pc_t pc)
|
||||
{
|
||||
if(pc.code_object_id == ROCPROFILER_CODE_OBJECT_ID_NONE)
|
||||
return -1;
|
||||
else
|
||||
return CHECK_NOTNULL(tool_metadata)->get_instruction_index(pc);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
|
||||
rocprofiler_buffer_id_t /* buffer_id*/,
|
||||
rocprofiler_record_header_t** headers,
|
||||
size_t num_headers,
|
||||
void* /*data*/,
|
||||
uint64_t /* drop_count*/)
|
||||
{
|
||||
if(!headers) return;
|
||||
|
||||
for(size_t i = 0; i < num_headers; i++)
|
||||
{
|
||||
auto* cur_header = headers[i];
|
||||
|
||||
if(cur_header == nullptr)
|
||||
{
|
||||
throw std::runtime_error{
|
||||
"rocprofiler provided a null pointer to header. this should never happen"};
|
||||
}
|
||||
else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_PC_SAMPLING)
|
||||
{
|
||||
if(cur_header->kind == ROCPROFILER_PC_SAMPLING_RECORD_HOST_TRAP_V0_SAMPLE)
|
||||
{
|
||||
auto* pc_sample = static_cast<rocprofiler_pc_sampling_record_host_trap_v0_t*>(
|
||||
cur_header->payload);
|
||||
|
||||
auto pc_sample_tool_record =
|
||||
rocprofiler::tool::rocprofiler_tool_pc_sampling_host_trap_record_t(
|
||||
*pc_sample, get_instruction_index(pc_sample->pc));
|
||||
|
||||
rocprofiler::tool::write_ring_buffer(pc_sample_tool_record,
|
||||
domain_type::PC_SAMPLING_HOST_TRAP);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCP_FATAL << "unexpected rocprofiler_record_header_t category + kind";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
@@ -1018,6 +1076,26 @@ finalize_rocprofv3(std::string_view context)
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
if_pc_sample_config_match(rocprofiler_agent_id_t agent_id,
|
||||
rocprofiler_pc_sampling_method_t pc_sampling_method,
|
||||
rocprofiler_pc_sampling_unit_t pc_sampling_unit,
|
||||
uint64_t pc_sampling_interval)
|
||||
{
|
||||
auto pc_sampling_config = CHECK_NOTNULL(tool_metadata)->get_pc_sample_config_info(agent_id);
|
||||
if(!pc_sampling_config.empty())
|
||||
{
|
||||
for(auto config : pc_sampling_config)
|
||||
{
|
||||
if(config.method == pc_sampling_method && config.unit == pc_sampling_unit &&
|
||||
config.min_interval <= pc_sampling_interval &&
|
||||
config.max_interval >= pc_sampling_interval)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int
|
||||
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
{
|
||||
@@ -1277,6 +1355,44 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
"Could not configure external correlation id request service");
|
||||
}
|
||||
|
||||
if(tool::get_config().pc_sampling_host_trap)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
rocprofiler_pc_sampling_callback,
|
||||
tool_data,
|
||||
&get_buffers().pc_sampling_host_trap),
|
||||
"buffer creation");
|
||||
bool config_match_found = false;
|
||||
auto agent_ptr_vec = get_gpu_agents();
|
||||
for(auto& itr : agent_ptr_vec)
|
||||
{
|
||||
auto method = static_cast<rocprofiler_pc_sampling_method_t>(
|
||||
tool::get_config().pc_sampling_method_value);
|
||||
auto unit = static_cast<rocprofiler_pc_sampling_unit_t>(
|
||||
tool::get_config().pc_sampling_unit_value);
|
||||
if(if_pc_sample_config_match(
|
||||
itr->id, method, unit, tool::get_config().pc_sampling_interval))
|
||||
{
|
||||
config_match_found = true;
|
||||
int flags = 0;
|
||||
ROCPROFILER_CALL(rocprofiler_configure_pc_sampling_service(
|
||||
get_client_ctx(),
|
||||
itr->id,
|
||||
method,
|
||||
unit,
|
||||
tool::get_config().pc_sampling_interval,
|
||||
get_buffers().pc_sampling_host_trap,
|
||||
flags),
|
||||
"configure PC sampling");
|
||||
}
|
||||
}
|
||||
if(!config_match_found)
|
||||
ROCP_ERROR << "Given PC sampling configuration is not supported on any of the agents";
|
||||
}
|
||||
|
||||
for(auto itr : get_buffers().as_array())
|
||||
{
|
||||
if(itr.handle > 0)
|
||||
@@ -1380,6 +1496,8 @@ tool_fini(void* /*tool_data*/)
|
||||
tool::memory_allocation_buffered_output_t{tool::get_config().memory_allocation_trace};
|
||||
auto counters_records_output =
|
||||
tool::counter_records_buffered_output_t{tool::get_config().counter_collection};
|
||||
auto pc_sampling_host_trap_output =
|
||||
tool::pc_sampling_host_trap_buffered_output_t{tool::get_config().pc_sampling_host_trap};
|
||||
|
||||
auto node_id_sort = [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; };
|
||||
|
||||
@@ -1402,6 +1520,7 @@ tool_fini(void* /*tool_data*/)
|
||||
generate_output(rccl_output, contributions);
|
||||
generate_output(counters_output, contributions);
|
||||
generate_output(scratch_memory_output, contributions);
|
||||
generate_output(pc_sampling_host_trap_output, contributions);
|
||||
|
||||
if(tool::get_config().stats && tool::get_config().csv_output)
|
||||
{
|
||||
@@ -1426,7 +1545,8 @@ tool_fini(void* /*tool_data*/)
|
||||
marker_output.get_generator(),
|
||||
scratch_memory_output.get_generator(),
|
||||
rccl_output.get_generator(),
|
||||
memory_allocation_output.get_generator());
|
||||
memory_allocation_output.get_generator(),
|
||||
pc_sampling_host_trap_output.get_generator());
|
||||
json_ar.finish_process();
|
||||
|
||||
tool::close_json(json_ar);
|
||||
@@ -1489,6 +1609,7 @@ tool_fini(void* /*tool_data*/)
|
||||
destroy_output(scratch_memory_output);
|
||||
destroy_output(rccl_output);
|
||||
destroy_output(counters_records_output);
|
||||
destroy_output(pc_sampling_host_trap_output);
|
||||
|
||||
if(destructors)
|
||||
{
|
||||
@@ -1502,7 +1623,6 @@ tool_fini(void* /*tool_data*/)
|
||||
__gcov_dump();
|
||||
#endif
|
||||
}
|
||||
} // namespace
|
||||
|
||||
std::vector<rocprofiler_record_dimension_info_t>
|
||||
get_tool_counter_dimension_info()
|
||||
|
||||
@@ -25,3 +25,4 @@ add_subdirectory(page-migration)
|
||||
add_subdirectory(hsa-queue-dependency)
|
||||
add_subdirectory(hip-graph)
|
||||
add_subdirectory(hsa-memory-allocation)
|
||||
add_subdirectory(pc-sampling)
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
#
|
||||
# Integration test applications
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-tests-bin LANGUAGES C CXX)
|
||||
|
||||
set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib")
|
||||
|
||||
# applications used by integration tests which DO NOT link to rocprofiler-sdk-roctx
|
||||
add_subdirectory(exec-mask-manipulation)
|
||||
+43
@@ -0,0 +1,43 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
|
||||
set_source_files_properties(exec_mask_manipulation.cpp PROPERTIES LANGUAGE HIP)
|
||||
add_executable(exec-mask-manipulation)
|
||||
target_sources(exec-mask-manipulation PRIVATE exec_mask_manipulation.cpp)
|
||||
# debug symbols required for PC sampling decoding validation
|
||||
target_compile_options(exec-mask-manipulation PRIVATE -W -Wall -Wextra -Wpedantic
|
||||
-Wshadow -Werror -g)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(exec-mask-manipulation PRIVATE Threads::Threads)
|
||||
+537
@@ -0,0 +1,537 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#define ITER_NUM 16 * 1024
|
||||
#define BLOCK_SIZE 1024
|
||||
|
||||
#define HIP_API_CALL(CALL) \
|
||||
{ \
|
||||
hipError_t error_ = (CALL); \
|
||||
if(error_ != hipSuccess) \
|
||||
{ \
|
||||
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
|
||||
fprintf(stderr, \
|
||||
"%s:%d :: HIP error : %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
hipGetErrorString(error_)); \
|
||||
throw std::runtime_error("hip_api_call"); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
|
||||
void
|
||||
check_hip_error(void);
|
||||
} // namespace
|
||||
|
||||
// ======================================================
|
||||
__global__ void
|
||||
kernel1(const int c)
|
||||
{
|
||||
int a = 0;
|
||||
#pragma nounroll
|
||||
for(int i = 0; i < ITER_NUM; i++)
|
||||
{
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
asm volatile("v_mov_b32 %0 %1\n" : "=v"(a) : "s"(c));
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
kernel2(const int c)
|
||||
{
|
||||
int a = 0;
|
||||
#pragma nounroll
|
||||
for(int i = 0; i < ITER_NUM; i++)
|
||||
{
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
asm volatile("s_mov_b32 %0 %1\n" : "=s"(a) : "s"(c));
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
kernel3(const float c)
|
||||
{
|
||||
double a = threadIdx.x;
|
||||
float i = 0;
|
||||
float d = 0;
|
||||
float e = 0;
|
||||
int tid_even = threadIdx.x % 2;
|
||||
for(int j = 0; j < ITER_NUM; j++)
|
||||
{
|
||||
if(tid_even == 0)
|
||||
{
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
asm volatile("v_rcp_f64 %0, %0\n" : "+v"(a), "=s"(i) : "s"(c));
|
||||
}
|
||||
else
|
||||
{
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
asm volatile("v_fmac_f32 %0, %0, %1\n" : "+v"(d) : "v"(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ======================================================
|
||||
|
||||
void
|
||||
run_kernel()
|
||||
{
|
||||
for(int i = 1; i <= 64; i++)
|
||||
{
|
||||
if(i % 2 == 1)
|
||||
kernel1<<<BLOCK_SIZE, i>>>(i);
|
||||
else
|
||||
kernel2<<<BLOCK_SIZE, i>>>(i);
|
||||
|
||||
check_hip_error();
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
float arg = 0;
|
||||
kernel3<<<BLOCK_SIZE, 4 * 64>>>(arg);
|
||||
check_hip_error();
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
run_kernel();
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
void
|
||||
check_hip_error(void)
|
||||
{
|
||||
hipError_t err = hipGetLastError();
|
||||
if(err != hipSuccess)
|
||||
{
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("hip_api_call");
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
@@ -34,3 +34,4 @@ add_subdirectory(aborted-app)
|
||||
add_subdirectory(summary)
|
||||
add_subdirectory(roctracer-roctx)
|
||||
add_subdirectory(scratch-memory)
|
||||
add_subdirectory(pc-sampling)
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
#
|
||||
# PC sampling tests
|
||||
#
|
||||
|
||||
add_subdirectory(host-trap)
|
||||
@@ -0,0 +1,5 @@
|
||||
#
|
||||
# PC sampling tests
|
||||
#
|
||||
|
||||
add_subdirectory(exec-mask-manipulation)
|
||||
+150
@@ -0,0 +1,150 @@
|
||||
#
|
||||
# rocprofv3 tool test
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(
|
||||
rocprofiler-tests-pc-sampling
|
||||
LANGUAGES CXX
|
||||
VERSION 0.0.0)
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
|
||||
rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py
|
||||
input.json input.yml)
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --pc-sampling-unit time
|
||||
--pc-sampling-method host_trap --pc-sampling-interval 1 -d
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input -o out --output-format csv json
|
||||
-- $<TARGET_FILE:exec-mask-manipulation>)
|
||||
|
||||
string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
|
||||
|
||||
set(cc-env-pc-sampling "${PRELOAD_ENV}")
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute
|
||||
PROPERTIES TIMEOUT
|
||||
45
|
||||
LABELS
|
||||
"integration-tests;pc-sampling"
|
||||
ENVIRONMENT
|
||||
"${cc-env-pc-sampling}"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"PC sampling unavailable")
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
|
||||
${CMAKE_CURRENT_BINARY_DIR}/input.json -d
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input --
|
||||
$<TARGET_FILE:exec-mask-manipulation>)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute
|
||||
PROPERTIES TIMEOUT
|
||||
45
|
||||
LABELS
|
||||
"integration-tests;pc-sampling"
|
||||
ENVIRONMENT
|
||||
"${cc-env-pc-sampling}"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"PC sampling unavailable")
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
|
||||
${CMAKE_CURRENT_BINARY_DIR}/input.yml -d
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input -o out --output-format csv
|
||||
json -- $<TARGET_FILE:exec-mask-manipulation>)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute
|
||||
PROPERTIES TIMEOUT
|
||||
45
|
||||
LABELS
|
||||
"integration-tests;pc-sampling"
|
||||
ENVIRONMENT
|
||||
"${cc-env-pc-sampling}"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"PC sampling unavailable")
|
||||
|
||||
# ========================= Validation tests
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-validate
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k
|
||||
test_validate_pc_sampling_exec_mask_manipulation_ --input-csv
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_pc_sampling_host_trap.csv
|
||||
--input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_cmd_input/out_results.json
|
||||
--all-sampled False)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-validate
|
||||
PROPERTIES TIMEOUT
|
||||
60
|
||||
LABELS
|
||||
"integration-tests;pc-sampling"
|
||||
DEPENDS
|
||||
"rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-cmd-execute"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"PC sampling unavailable")
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-validate
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k
|
||||
test_validate_pc_sampling_exec_mask_manipulation_ --input-csv
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_pc_sampling_host_trap.csv
|
||||
--input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_json_input/out_results.json
|
||||
--all-sampled False)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-validate
|
||||
PROPERTIES TIMEOUT
|
||||
60
|
||||
LABELS
|
||||
"integration-tests;pc-sampling"
|
||||
DEPENDS
|
||||
"rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-json-execute"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"PC sampling unavailable")
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-validate
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k
|
||||
test_validate_pc_sampling_exec_mask_manipulation_ --input-csv
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_pc_sampling_host_trap.csv
|
||||
--input-json ${CMAKE_CURRENT_BINARY_DIR}/pc_sampling_yaml_input/out_results.json
|
||||
--all-sampled False)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-validate
|
||||
PROPERTIES TIMEOUT
|
||||
60
|
||||
LABELS
|
||||
"integration-tests;pc-sampling"
|
||||
DEPENDS
|
||||
"rocprofv3-test-cc-pc-sampling-exec-mask-manipulation-input-yaml-execute"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
SKIP_REGULAR_EXPRESSION
|
||||
"PC sampling unavailable")
|
||||
+67
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
import pandas as pd
|
||||
|
||||
from rocprofiler_sdk.pytest_utils.dotdict import dotdict
|
||||
from rocprofiler_sdk.pytest_utils import collapse_dict_list
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--input-csv",
|
||||
action="store",
|
||||
help="Path to CSV file.",
|
||||
)
|
||||
|
||||
parser.addoption(
|
||||
"--input-json",
|
||||
action="store",
|
||||
help="Path to CSV file.",
|
||||
)
|
||||
|
||||
parser.addoption(
|
||||
"--all-sampled",
|
||||
action="store",
|
||||
help="All SW and HW units must be sampled.",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def input_csv(request):
|
||||
filename = request.config.getoption("--input-csv")
|
||||
if not os.path.isfile(filename):
|
||||
# The CSV file is not generated, because the dependency test
|
||||
# responsible to generate this file was skipped or failed.
|
||||
# Thus emit the message to skip this test as well.
|
||||
print("PC sampling unavailable")
|
||||
else:
|
||||
with open(filename, "r") as inp:
|
||||
return pd.read_csv(
|
||||
inp,
|
||||
na_filter=False, # parse empty fields as ""
|
||||
keep_default_na=False, # parse empty fields as ""
|
||||
dtype={
|
||||
"Exec_Mask": "uint64",
|
||||
"Instruction": str,
|
||||
"Instruction_Comment": str,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def input_json(request):
|
||||
filename = request.config.getoption("--input-json")
|
||||
with open(filename, "r") as inp:
|
||||
# Significant overhead of 5-6secs observed when feeding
|
||||
# data into the dotdict.
|
||||
# Using plain python dict instead
|
||||
return collapse_dict_list(json.load(inp))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def all_sampled(request):
|
||||
_all_sampled_str = request.config.getoption("--all-sampled")
|
||||
return _all_sampled_str == "True"
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"jobs": [
|
||||
{
|
||||
"pc_sampling_unit": "time",
|
||||
"pc_sampling_method": "host_trap",
|
||||
"pc_sampling_interval": 1,
|
||||
"output_file": "out",
|
||||
"output_format": [
|
||||
"csv",
|
||||
"json"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
jobs:
|
||||
- pc_sampling_unit: "time"
|
||||
pc_sampling_method: "host_trap"
|
||||
pc_sampling_interval: 1
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
[pytest]
|
||||
addopts = --durations=20 -rA -s -vv
|
||||
testpaths = validate.py
|
||||
pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
|
||||
+376
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import itertools
|
||||
import sys
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# =========================== Validating CSV output
|
||||
|
||||
|
||||
# Keep this in case we decide to revert workgroup_id information
|
||||
def validate_workgoup_id_x_y_z(df, max_x, max_y, max_z):
|
||||
assert (df["Workgroup_Size_X"].astype(int) >= 0).all()
|
||||
assert (df["Workgroup_Size_X"].astype(int) <= max_x).all()
|
||||
|
||||
assert (df["Workgroup_Size_Y"].astype(int) >= 0).all()
|
||||
assert (df["Workgroup_Size_Y"].astype(int) <= max_y).all()
|
||||
|
||||
assert (df["Workgroup_Size_Z"].astype(int) >= 0).all()
|
||||
assert (df["Workgroup_Size_Z"].astype(int) <= max_z).all()
|
||||
|
||||
|
||||
# Keep this in case we decide to revert wave_id information
|
||||
def validate_wave_id(df, max_wave_id):
|
||||
assert (df["Wave_Id"].astype(int) <= max_wave_id).all()
|
||||
|
||||
|
||||
# Keep this in case we decide to revert wave_id information
|
||||
def validate_chiplet(df, max_chiplet):
|
||||
assert (df["Chiplet"].astype(int) <= max_chiplet).all()
|
||||
|
||||
|
||||
def validate_instruction_decoding(
|
||||
df,
|
||||
inst_str,
|
||||
exec_mask_uint64: np.uint64 = None,
|
||||
source_code_lines_range: (int, int) = None,
|
||||
all_source_lines_samples=False,
|
||||
):
|
||||
# Make a copy, so that we don't work (modify) a view.
|
||||
df_inst = df[df["Instruction"].apply(lambda inst: inst.startswith(inst_str))].copy()
|
||||
|
||||
assert not df_inst.empty
|
||||
# assert the exec mask if requested
|
||||
if exec_mask_uint64 is not None:
|
||||
assert (df_inst["Exec_Mask"].astype(np.uint64) == exec_mask_uint64).all()
|
||||
|
||||
# assert whether the samples source code lines belongs to the provided range
|
||||
if source_code_lines_range is not None:
|
||||
start_range, end_range = source_code_lines_range
|
||||
# The instruction comment is isually in the following format: /path/to/source/file.cpp:line_num
|
||||
df_inst["source_line_num"] = df_inst["Instruction_Comment"].apply(
|
||||
lambda source_line: int(source_line.split(":")[-1])
|
||||
)
|
||||
assert (df_inst["source_line_num"] >= start_range).all()
|
||||
assert (df_inst["source_line_num"] <= end_range).all()
|
||||
# if requested, check if all lines from the range are sampled
|
||||
if all_source_lines_samples:
|
||||
assert len(df_inst["source_line_num"].unique()) == (
|
||||
end_range - start_range + 1
|
||||
)
|
||||
|
||||
|
||||
def validate_instruction_comment(df):
|
||||
# Instruction comment must always be present, since the testing application
|
||||
# is built with debug symbols.
|
||||
assert (
|
||||
(df["Instruction_Comment"] != "") & (df["Instruction_Comment"] != "nullptr")
|
||||
).all()
|
||||
|
||||
|
||||
def validate_instruction_correlation_id_relation(df):
|
||||
# Samples with no decoded instructions originates from either
|
||||
# blit kernels or self modifying code. The correlation id for this
|
||||
# type of samples should alway be zero.
|
||||
# Thus, Correlation_Id is 0 `iff`` instruction is not decoded.
|
||||
|
||||
# The previous statement has two implications.
|
||||
# Implication 1: If the instruction is not decoded, then correlation id is 0.
|
||||
samples_no_instruction_df = df[
|
||||
(df["Instruction"] == "") | (df["Instruction"] == "nullptr")
|
||||
]
|
||||
assert (samples_no_instruction_df["Correlation_Id"] == 0).all()
|
||||
|
||||
# Implication 2: If the correlation id is 0, then the instruction is not decoded.
|
||||
samples_cid_zero_df = df[df["Correlation_Id"] == 0]
|
||||
assert (
|
||||
(samples_cid_zero_df["Instruction"] == "")
|
||||
| (samples_cid_zero_df["Instruction"] == "nullptr")
|
||||
).all()
|
||||
|
||||
assert len(samples_no_instruction_df) == len(samples_cid_zero_df)
|
||||
|
||||
# Since we're not enabling any kind of API tracing,
|
||||
# internal correlation id should match the dispatch id
|
||||
assert all(df["Correlation_Id"] == df["Dispatch_Id"])
|
||||
|
||||
|
||||
def validate_exec_mask_based_on_correlation_id(df):
|
||||
# The function assumes that each kernel launches 1024 blocks.
|
||||
# Each block contains number of threads that matches correlation ID of the kernel.
|
||||
# The exec mask of a sample should contain number of ones equal to
|
||||
# the correlation ID of the kernel during which execution the sample was generated.
|
||||
df["active_SIMD_threads"] = df["Exec_Mask"].apply(
|
||||
lambda exec_mask: bin(exec_mask).count("1")
|
||||
)
|
||||
assert (df["active_SIMD_threads"] == df["Correlation_Id"]).all()
|
||||
|
||||
# TODO: Comment out the following code if it causes spurious fails.
|
||||
# The more conservative constraint based on the experience follows.
|
||||
# The exec mask of sampled instructions of the kernels respect the following pattern:
|
||||
# cid -> exec
|
||||
# 1 -> 0b1
|
||||
# 2 -> 0b11
|
||||
# 3 -> 0b111
|
||||
# ...
|
||||
# 64 -> 0xffffffffffffffff
|
||||
|
||||
df["Exec_Mask2"] = (
|
||||
df["Correlation_Id"].astype(int).apply(lambda x: int("0b" + (x * "1"), 2))
|
||||
)
|
||||
|
||||
# TODO: exec should be in hex and that will ease the comparison
|
||||
assert (df["Exec_Mask"].astype(np.uint64) == df["Exec_Mask2"].astype(np.uint64)).all()
|
||||
|
||||
|
||||
def exec_mask_manipulation_validate_csv(df, all_sampled=False):
|
||||
assert not df.empty
|
||||
|
||||
validate_instruction_comment(df)
|
||||
validate_instruction_correlation_id_relation(df)
|
||||
|
||||
# Validate samples with non-zero correlation IDs (and with decoded instructions)
|
||||
samples_cid_non_zero_df = df[df["Correlation_Id"] != 0]
|
||||
|
||||
# exactly 65 kernels and 65 correlation id
|
||||
assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) >= 1).all()
|
||||
assert (samples_cid_non_zero_df["Correlation_Id"].astype(int) <= 65).all()
|
||||
if all_sampled:
|
||||
# all correlation IDs must be sampled
|
||||
assert len(samples_cid_non_zero_df["Correlation_Id"].astype(int).unique()) == 65
|
||||
|
||||
first_64_kernels_df = samples_cid_non_zero_df[
|
||||
samples_cid_non_zero_df["Correlation_Id"] <= 64
|
||||
]
|
||||
|
||||
# Make a copy, so that we don't work (modify) a view.
|
||||
validate_exec_mask_based_on_correlation_id(first_64_kernels_df.copy())
|
||||
|
||||
# validate the last kernel
|
||||
kernel_65_df = df[df["Correlation_Id"] == 65]
|
||||
|
||||
# assert that v_rcp instructions are properly decoded
|
||||
# the v_rcp is executed by even SIMD threads
|
||||
validate_instruction_decoding(
|
||||
kernel_65_df,
|
||||
"v_rcp_f64",
|
||||
exec_mask_uint64=np.uint64(int("5555555555555555", 16)),
|
||||
source_code_lines_range=(288, 387),
|
||||
all_source_lines_samples=all_sampled,
|
||||
)
|
||||
|
||||
# assert that v_fmac_f32 instructions are properly decoded
|
||||
# the v_fmac_f32 is executed by odd SIMD threads
|
||||
validate_instruction_decoding(
|
||||
kernel_65_df,
|
||||
"v_fmac_f32",
|
||||
exec_mask_uint64=np.uint64(int("AAAAAAAAAAAAAAAA", 16)),
|
||||
source_code_lines_range=(391, 490),
|
||||
all_source_lines_samples=all_sampled,
|
||||
)
|
||||
|
||||
|
||||
def test_validate_pc_sampling_exec_mask_manipulation_csv(
|
||||
input_csv: pd.DataFrame, all_sampled: bool
|
||||
):
|
||||
exec_mask_manipulation_validate_csv(input_csv, all_sampled=all_sampled)
|
||||
|
||||
|
||||
# ========================= Validating JSON output
|
||||
|
||||
|
||||
def validate_json_exec_mask_manipulation(data_json, all_sampled=False):
|
||||
# Although functional programming might look more elegant,
|
||||
# I was trying to avoid multiple iteration over the list of samples.
|
||||
# Thus, I decided to use procedural programming instead.
|
||||
# Although, it would be more elegant to wrap some of the checks in dedicated functions,
|
||||
# I noticed that it can introduce significant overhead, so I decided to inline those checks.
|
||||
|
||||
# the function assume homogenous system
|
||||
agents = data_json["agents"]
|
||||
gpu_agents = list(filter(lambda agent: agent["type"] == 2, agents))
|
||||
# There should be at least one GPU agent
|
||||
assert len(gpu_agents) > 0
|
||||
first_gpu_agent = gpu_agents[0]
|
||||
num_xcc = first_gpu_agent["num_xcc"]
|
||||
max_waves_per_simd = first_gpu_agent["max_waves_per_simd"]
|
||||
simd_per_cu = first_gpu_agent["simd_per_cu"]
|
||||
|
||||
instructions = data_json["strings"]["pc_sample_instructions"]
|
||||
comments = data_json["strings"]["pc_sample_comments"]
|
||||
|
||||
# execution mask where even SIMD lanes are active
|
||||
# correspond to the v_rcp_f64 instructions of the last kernel
|
||||
even_simds_active_exec_mask = np.uint64(int("5555555555555555", 16))
|
||||
# start and end source code lines of the v_rcp_f64 instructions of the last kernel
|
||||
v_rcp_f64_start_line_num, v_rcp_f64_end_line_num = 288, 387
|
||||
# execution mask where even SIMD lanes are active
|
||||
# correspond to the v_rcp_f64 instructions of the last kernel
|
||||
odd_simds_active_exec_mask = np.uint64(int("AAAAAAAAAAAAAAAA", 16))
|
||||
# start and end source code lines of the v_fmac_f32 0 instructions of the last kernel
|
||||
v_fmac_f32_start_line_num, v_fmac_f32_end_line_num = 391, 490
|
||||
|
||||
# sampled wave_ids of the last kernel
|
||||
kernel65_sampled_wave_in_grp = set()
|
||||
# sampled source lines of the last kernel matching v_rcp_f64 instructions
|
||||
kernel65_v_rcp_64_sampled_source_line_set = set()
|
||||
# sampled source lines of the last kernel matching v_rcp_f64 instructions
|
||||
kernel65_v_fmac_f32_sampled_source_line_set = set()
|
||||
# sampled correlation IDs
|
||||
sampled_cids_set = set()
|
||||
# pairs of sampled SIMD ids and waveslot IDs
|
||||
sampled_simd_waveslots_pairs = set()
|
||||
# sampled chiplets
|
||||
sampled_chiplets = set()
|
||||
# sample VMIDs
|
||||
sampled_vmids = set()
|
||||
|
||||
for sample in data_json["buffer_records"]["pc_sample_host_trap"]:
|
||||
record = sample["record"]
|
||||
cid = record["corr_id"]["internal"]
|
||||
|
||||
# pull information from hw_id
|
||||
hw_id = record["hw_id"]
|
||||
sampled_chiplets.add(hw_id["chiplet"])
|
||||
sampled_simd_waveslots_pairs.add((hw_id["simd_id"], hw_id["wave_id"]))
|
||||
sampled_vmids.add(hw_id["vm_id"])
|
||||
|
||||
# Checks specific for all samples
|
||||
|
||||
# cids must be non-negative numbers
|
||||
assert cid >= 0
|
||||
|
||||
inst_index = sample["inst_index"]
|
||||
|
||||
# Since we're not enabling any kind of API tracing, the internal correlation id should
|
||||
# be equal to the dispatch_id
|
||||
assert cid == record["dispatch_id"]
|
||||
|
||||
if cid == 0:
|
||||
# Samples originates either from a blit kernel or self-modifying code.
|
||||
# Thus, code object is uknown, as well as the instruction.
|
||||
assert record["pc"]["code_object_id"] == 0
|
||||
assert inst_index == -1
|
||||
else:
|
||||
# Update set of sampled cids
|
||||
sampled_cids_set.add(cid)
|
||||
|
||||
# All samples with non-zero correlation ID should pass the following checks
|
||||
# code object is know, so as the instruction
|
||||
assert record["pc"]["code_object_id"] != 0
|
||||
assert inst_index != -1
|
||||
|
||||
wgid = record["wrkgrp_id"]
|
||||
# check corrdinates of the workgroup
|
||||
assert wgid["x"] >= 0 and wgid["x"] <= 1023
|
||||
assert wgid["y"] == 0
|
||||
assert wgid["z"] == 0
|
||||
|
||||
wave_in_grp = record["wave_in_grp"]
|
||||
exec_mask = record["exec_mask"]
|
||||
|
||||
if cid < 65:
|
||||
# checks specific for samples from first 64 kernels
|
||||
assert wave_in_grp == 0
|
||||
# inline if possible
|
||||
# validate_json_exec_mask_based_on_cid(sample.record)
|
||||
|
||||
# The function assumes that each kernel launches 1024 blocks.
|
||||
# Each block contains number of threads that matches correlation ID of the kernel.
|
||||
# The exec mask of a sample should contain number of ones equal to
|
||||
# the correlation ID of the kernel during which execution the sample was generated.
|
||||
assert bin(exec_mask).count("1") == cid
|
||||
|
||||
# TODO: Comment out the following code if it causes spurious fails.
|
||||
# The more conservative constraint based on the experience follows.
|
||||
# The exec mask of sampled instructions of the kernels respect the following pattern:
|
||||
# cid -> exec
|
||||
# 1 -> 0b1
|
||||
# 2 -> 0b11
|
||||
# 3 -> 0b111
|
||||
# ...
|
||||
# 64 -> 0xffffffffffffffff
|
||||
exec_mask_str = "0b" + "1" * cid
|
||||
assert np.uint64(exec_mask) == np.uint64(int(exec_mask_str, 2))
|
||||
else:
|
||||
# No more that 65 cids
|
||||
assert cid == 65
|
||||
# Monitor wave_in_group being sampled
|
||||
kernel65_sampled_wave_in_grp.add(wave_in_grp)
|
||||
# chekcs specific for samples from the last kernel
|
||||
assert wave_in_grp >= 0 and wave_in_grp <= 3
|
||||
|
||||
# validate instruction decoding
|
||||
inst = instructions[inst_index]
|
||||
comm = comments[inst_index]
|
||||
# The instruction comment is isually in the following format:
|
||||
# /path/to/source/file.cpp:line_num
|
||||
line_num = int(comm.split(":")[-1])
|
||||
if inst.startswith("v_rcp_f64"):
|
||||
# even SIMD lanes active
|
||||
assert np.uint64(exec_mask) == even_simds_active_exec_mask
|
||||
assert (
|
||||
line_num >= v_rcp_f64_start_line_num
|
||||
and line_num <= v_rcp_f64_end_line_num
|
||||
)
|
||||
kernel65_v_rcp_64_sampled_source_line_set.add(line_num)
|
||||
elif inst.startswith("v_fmac_f32"):
|
||||
# odd SIMD lanes active
|
||||
assert np.uint64(exec_mask) == odd_simds_active_exec_mask
|
||||
assert (
|
||||
line_num >= v_fmac_f32_start_line_num
|
||||
and line_num <= v_fmac_f32_end_line_num
|
||||
)
|
||||
kernel65_v_fmac_f32_sampled_source_line_set.add(line_num)
|
||||
|
||||
if all_sampled:
|
||||
# All cids that belongs to the range [1, 65] should be samples
|
||||
assert len(sampled_cids_set) == 65
|
||||
|
||||
# all wave_ids that belongs to the range [0, 3] should be sampled for the last kernel
|
||||
assert len(kernel65_sampled_wave_in_grp) == 4
|
||||
|
||||
# all source lines matches v_rcp_f64 instructions of the last kernel should be sampled
|
||||
assert len(kernel65_v_rcp_64_sampled_source_line_set) == (
|
||||
v_rcp_f64_end_line_num - v_rcp_f64_start_line_num + 1
|
||||
)
|
||||
# all source lines matches v_fmac_f32 instructions of the last kernel should be sampled
|
||||
assert len(kernel65_v_fmac_f32_sampled_source_line_set) == (
|
||||
v_fmac_f32_end_line_num - v_fmac_f32_start_line_num + 1
|
||||
)
|
||||
|
||||
# all chiplets must be sampled
|
||||
assert len(sampled_chiplets) == num_xcc
|
||||
# all (simd ID, waveslot ID) pairs must be samples
|
||||
assert len(sampled_simd_waveslots_pairs) == simd_per_cu * max_waves_per_simd
|
||||
|
||||
# assert chiplet index
|
||||
assert all(map(lambda chiplet: 0 <= chiplet < num_xcc, sampled_chiplets))
|
||||
# assert (SIMD ID, waveslot ID) combinations
|
||||
assert all(
|
||||
map(
|
||||
lambda simd_waveslot: (0 <= simd_waveslot[0] < simd_per_cu)
|
||||
and (0 <= simd_waveslot[1] < max_waves_per_simd),
|
||||
sampled_simd_waveslots_pairs,
|
||||
)
|
||||
)
|
||||
# all samples should belong to the same VMID
|
||||
assert len(sampled_vmids) == 1
|
||||
|
||||
|
||||
def test_validate_pc_sampling_exec_mask_manipulation_json(
|
||||
input_json, input_csv: pd.DataFrame, all_sampled: bool
|
||||
):
|
||||
data = input_json["rocprofiler-sdk-tool"]
|
||||
# The same amount of samples should be in both CSV and JSON files.
|
||||
assert len(input_csv) == len(data["buffer_records"]["pc_sample_host_trap"])
|
||||
# # validating JSON output
|
||||
validate_json_exec_mask_manipulation(data, all_sampled=all_sampled)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
|
||||
sys.exit(exit_code)
|
||||
مرجع در شماره جدید
Block a user