Put cached perfetto traces as default one (#2138)
* Put cached perfetto traces as default one * Improve cached data and perfetto traces in order to be more aligned with E2E tests * Addressing PR comments and findings * Force early instrumentation bundle instantiation * Sync-up instrumented containers with thread growth data * Revert ompvv number of host threads to default 8 * Fixed counter track namings for amd-smi * AIPROFSYST-34 [rocprof-sys] Update documentation describing newly introduced changes to default tracing mechanism
This commit is contained in:
@@ -4,6 +4,25 @@
|
||||
|
||||
Full documentation for ROCm Systems Profiler is available at [https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/).
|
||||
|
||||
## ROCm Systems Profiler 1.4.0 for ROCm x.y.z (unreleased)
|
||||
|
||||
### Added
|
||||
|
||||
- Documentation for `ROCPROFSYS_TRACE_CACHED` configuration option and its performance benefits.
|
||||
- Documentation for `--trace-legacy` / `-L` CLI flag for direct tracing mode.
|
||||
|
||||
### Changed
|
||||
|
||||
- `ROCPROFSYS_TRACE_CACHED` is now the default perfetto tracing mode for improved performance.
|
||||
- Renamed `ROCPROFSYS_TRACE` to `ROCPROFSYS_TRACE_LEGACY` (with backward compatibility).
|
||||
- `--trace` / `-T` CLI flag now uses cached mode by default.
|
||||
- Added `--trace-legacy` / `-L` CLI flag for direct tracing mode.
|
||||
|
||||
### Deprecated
|
||||
|
||||
- `ROCPROFSYS_TRACE` environment variable (use `ROCPROFSYS_TRACE_LEGACY` for direct mode).
|
||||
- `ROCPROFSYS_USE_PERFETTO` environment variable (use `ROCPROFSYS_TRACE_LEGACY` for direct mode).
|
||||
|
||||
## ROCm Systems Profiler 1.3.0 for ROCm 7.2.0
|
||||
|
||||
### Added
|
||||
|
||||
@@ -176,7 +176,17 @@ Primary collection modes
|
||||
Trace mode (default)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Tracing mode generates comprehensive, deterministic traces of every event and measurement during application execution. This mode can be enabled using ``ROCPROFSYS_TRACE=true`` or ``ROCPROFSYS_MODE=trace`` setting.
|
||||
Tracing mode generates comprehensive, deterministic traces of every event and measurement during application execution. This mode can be enabled using ``ROCPROFSYS_MODE=trace`` or by enabling one of the trace backend options.
|
||||
|
||||
ROCm Systems Profiler provides two trace backend modes:
|
||||
|
||||
- **Cached Mode (default, recommended)**: ``ROCPROFSYS_TRACE_CACHED=true`` or ``--trace`` / ``-T`` enables deferred trace generation with minimal runtime overhead. Trace data is buffered during execution and written after the application completes, significantly reducing performance impact during profiling.
|
||||
|
||||
- **Legacy Mode**: ``ROCPROFSYS_TRACE_LEGACY=true`` or ``--trace-legacy`` / ``-L`` enables direct mode where trace data is written immediately during execution. This mode provides real-time trace generation but has higher runtime overhead compared to cached mode.
|
||||
|
||||
.. note::
|
||||
|
||||
The ``ROCPROFSYS_TRACE`` environment variable is deprecated and has been renamed to ``ROCPROFSYS_TRACE_LEGACY``. For new workflows, use ``ROCPROFSYS_TRACE_CACHED`` (default) or ``ROCPROFSYS_TRACE_LEGACY`` explicitly.
|
||||
|
||||
Additional configuration options to control the tracing behavior include:
|
||||
|
||||
@@ -263,4 +273,4 @@ Granularity options:
|
||||
- Function-level: ``--coverage=function`` (``CODECOV_FUNCTION``)
|
||||
- Basic block-level: ``--coverage=basic_block`` (``CODECOV_BASIC_BLOCK``)
|
||||
|
||||
.. note:: Coverage mode disables several other features and all other modes to reduce overhead.
|
||||
.. note:: Coverage mode disables several other features and all other modes to reduce overhead.
|
||||
|
||||
@@ -34,7 +34,8 @@ and tweak the default sampling values.
|
||||
.. code-block:: shell
|
||||
|
||||
# ...
|
||||
ROCPROFSYS_TRACE = true
|
||||
ROCPROFSYS_TRACE_CACHED = true # Recommended: deferred trace generation for minimal overhead
|
||||
# ROCPROFSYS_TRACE_LEGACY = false # Alternative: direct mode with higher overhead
|
||||
ROCPROFSYS_PROFILE = true
|
||||
ROCPROFSYS_USE_SAMPLING = true
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING = true
|
||||
@@ -339,7 +340,8 @@ Generating a default configuration file
|
||||
|
||||
ROCPROFSYS_CONFIG_FILE =
|
||||
ROCPROFSYS_MODE = trace
|
||||
ROCPROFSYS_TRACE = true
|
||||
ROCPROFSYS_TRACE_CACHED = true
|
||||
ROCPROFSYS_TRACE_LEGACY = false
|
||||
ROCPROFSYS_PROFILE = false
|
||||
ROCPROFSYS_USE_SAMPLING = false
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING = true
|
||||
@@ -497,7 +499,9 @@ Viewing the setting descriptions
|
||||
| ROCPROFSYS_USE_CODE_COVERAGE | Enable support for code coverage |
|
||||
| ROCPROFSYS_USE_KOKKOSP | Enable support for Kokkos Tools |
|
||||
| ROCPROFSYS_USE_OMPT | Enable support for OpenMP-Tools |
|
||||
| ROCPROFSYS_TRACE | Enable perfetto backend |
|
||||
| ROCPROFSYS_TRACE_CACHED | Enable perfetto backend with deferred...|
|
||||
| ROCPROFSYS_TRACE_LEGACY | Enable perfetto backend (legacy, dir... |
|
||||
| ROCPROFSYS_TRACE | [DEPRECATED] Renamed to ROCPROFSYS_T... |
|
||||
| ROCPROFSYS_USE_PID | Enable tagging filenames with proces... |
|
||||
| ROCPROFSYS_USE_AMD_SMI | Enable sampling GPU power, temp, uti... |
|
||||
| ROCPROFSYS_USE_ROCM | Enable ROCM tracing |
|
||||
@@ -1345,7 +1349,8 @@ but do not override an existing value for the environment variable.
|
||||
$SAMPLE = OFF
|
||||
|
||||
# use fields
|
||||
ROCPROFSYS_TRACE = $ENABLE
|
||||
ROCPROFSYS_TRACE_CACHED = $ENABLE # Recommended: deferred trace generation
|
||||
ROCPROFSYS_TRACE_LEGACY = OFF # Legacy direct mode (higher overhead)
|
||||
ROCPROFSYS_PROFILE = $ENABLE
|
||||
ROCPROFSYS_USE_SAMPLING = $SAMPLE
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING = $SAMPLE
|
||||
|
||||
@@ -75,7 +75,7 @@ The configuration parameter settings can be saved in a configuration file. Here
|
||||
ROCPROFSYS_SAMPLING_DELAY=0.05
|
||||
ROCPROFSYS_SAMPLING_CPUS=0-9
|
||||
ROCPROFSYS_SAMPLING_GPUS=$env:HIP_VISIBLE_DEVICES
|
||||
ROCPROFSYS_TRACE=ON
|
||||
ROCPROFSYS_TRACE_CACHED=ON
|
||||
ROCPROFSYS_PROFILE=ON
|
||||
ROCPROFSYS_USE_SAMPLING=ON
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=OFF
|
||||
|
||||
@@ -284,7 +284,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
|
||||
LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1
|
||||
ROCPROFSYS_CPU_FREQ_ENABLED=false
|
||||
ROCPROFSYS_PROFILE=true
|
||||
ROCPROFSYS_TRACE=true
|
||||
ROCPROFSYS_TRACE_CACHED=true
|
||||
ROCPROFSYS_USE_AMD_SMI=true
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=true
|
||||
ROCPROFSYS_USE_SAMPLING=true
|
||||
@@ -307,7 +307,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
|
||||
ROCPROFSYS_USE_KOKKOSP=true
|
||||
ROCPROFSYS_USE_MPIP=true
|
||||
ROCPROFSYS_USE_OMPT=true
|
||||
ROCPROFSYS_TRACE=true
|
||||
ROCPROFSYS_TRACE_CACHED=true
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=true
|
||||
ROCPROFSYS_USE_RCCLP=true
|
||||
ROCPROFSYS_USE_AMD_SMI=true
|
||||
@@ -337,7 +337,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ
|
||||
ROCPROFSYS_USE_KOKKOSP=false
|
||||
ROCPROFSYS_USE_MPIP=false
|
||||
ROCPROFSYS_USE_OMPT=false
|
||||
ROCPROFSYS_TRACE=true
|
||||
ROCPROFSYS_TRACE_CACHED=true
|
||||
ROCPROFSYS_USE_PROCESS_SAMPLING=true
|
||||
ROCPROFSYS_USE_RCCLP=false
|
||||
ROCPROFSYS_USE_AMD_SMI=false
|
||||
@@ -362,7 +362,7 @@ Here is the full output from the previous
|
||||
ROCPROFSYS_OUTPUT_PATH=rocprof-sys-output
|
||||
ROCPROFSYS_OUTPUT_PREFIX=%tag%
|
||||
ROCPROFSYS_PROFILE=true
|
||||
ROCPROFSYS_TRACE=true
|
||||
ROCPROFSYS_TRACE_CACHED=true
|
||||
ROCPROFSYS_TRACE_THREAD_LOCKS=false
|
||||
ROCPROFSYS_TRACE_THREAD_RW_LOCKS=false
|
||||
ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS=false
|
||||
|
||||
@@ -17,7 +17,7 @@ For example, starting with the following base configuration:
|
||||
export ROCPROFSYS_TIME_OUTPUT=ON
|
||||
export ROCPROFSYS_USE_PID=OFF
|
||||
export ROCPROFSYS_PROFILE=ON
|
||||
export ROCPROFSYS_TRACE=ON
|
||||
export ROCPROFSYS_TRACE_CACHED=ON
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
|
||||
@@ -192,7 +192,7 @@ First, instrument and run the program.
|
||||
ROCPROFSYS: LD_PRELOAD=/home/rocm-dev/code/rocprofiler-systems/build/ubuntu/22.04/lib/librocprof-sys-dl.so.1.0.0
|
||||
ROCPROFSYS: OMP_TOOL_LIBRARIES=/home/rocm-dev/code/rocprofiler-systems/build/ubuntu/22.04/lib/librocprof-sys-dl.so.1.0.0
|
||||
ROCPROFSYS: ROCPROFSYS_PROFILE=true
|
||||
ROCPROFSYS: ROCPROFSYS_TRACE=true
|
||||
ROCPROFSYS: ROCPROFSYS_TRACE_CACHED=true
|
||||
ROCPROFSYS: ROCPROFSYS_VERBOSE=0
|
||||
[rocprof-sys][dl][1827155] rocprofsys_main
|
||||
[rocprof-sys][1827155][rocprofsys_init_tooling] Instrumentation mode: Trace
|
||||
|
||||
@@ -88,8 +88,7 @@ function(configure_ompvv_tests TEST_TYPE TEST_LIST_VAR)
|
||||
NUM_THREADS_DEVICE=${OMPVV_NUM_THREADS_DEVICE}
|
||||
NUM_TEAMS_DEVICE=${OMPVV_NUM_TEAMS_DEVICE}
|
||||
"FOFFLOADING=${FOFFLOADING_FLAGS}" "FFLAGS=${CUSTOM_FFLAGS}"
|
||||
"FLINKFLAGS=${CUSTOM_FLINKFLAGS}" "SOURCES=${TEST}" compile >
|
||||
${CMAKE_CURRENT_BINARY_DIR}/ompvv-compile-${TARGET_NAME}.log 2>&1
|
||||
"FLINKFLAGS=${CUSTOM_FLINKFLAGS}" "SOURCES=${TEST}" compile
|
||||
COMMAND
|
||||
${CMAKE_COMMAND} -E copy_if_different
|
||||
"${ROCPROFSYS_OMPVV_SOURCE_DIR}/bin/${TEST_NAME}.F90.o"
|
||||
@@ -223,7 +222,7 @@ endif()
|
||||
|
||||
set(OMPVV_OPENMP_VERSION 5.0)
|
||||
set(OMPVV_TDIR "tests/${OMPVV_OPENMP_VERSION}")
|
||||
set(OMPVV_NUM_THREADS_HOST 128) # Smallest possible value of ROCPROFSYS_THREAD_COUNT. Avoids very long test times
|
||||
set(OMPVV_NUM_THREADS_HOST 8) # The default value from ompvv.F90
|
||||
set(OMPVV_NUM_TEAMS_DEVICE 8) # default used by ompvv
|
||||
set(OMPVV_NUM_THREADS_DEVICE 8) # default used by ompvv
|
||||
|
||||
|
||||
@@ -340,10 +340,11 @@ generate_config(std::string _config_file, const std::set<std::string>& _config_f
|
||||
if(_romni && !_lomni) return false;
|
||||
for(const auto* itr :
|
||||
{ "ROCPROFSYS_CONFIG", "ROCPROFSYS_MODE", "ROCPROFSYS_TRACE_CACHED",
|
||||
"ROCPROFSYS_TRACE", "ROCPROFSYS_PROFILE", "ROCPROFSYS_USE_SAMPLING",
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCM",
|
||||
"ROCPROFSYS_USE_AMD_SMI", "ROCPROFSYS_USE_KOKKOSP",
|
||||
"ROCPROFSYS_USE_OMPT", "ROCPROFSYS_USE", "ROCPROFSYS_OUTPUT" })
|
||||
"ROCPROFSYS_TRACE_LEGACY", "ROCPROFSYS_PROFILE",
|
||||
"ROCPROFSYS_USE_SAMPLING", "ROCPROFSYS_USE_PROCESS_SAMPLING",
|
||||
"ROCPROFSYS_USE_ROCM", "ROCPROFSYS_USE_AMD_SMI",
|
||||
"ROCPROFSYS_USE_KOKKOSP", "ROCPROFSYS_USE_OMPT", "ROCPROFSYS_USE",
|
||||
"ROCPROFSYS_OUTPUT" })
|
||||
{
|
||||
if(_lhs->get_env_name().find(itr) == 0 &&
|
||||
_rhs->get_env_name().find(itr) != 0)
|
||||
|
||||
@@ -189,7 +189,7 @@ get_initial_environment()
|
||||
update_env(_env, "ROCPROFSYS_TRACE_CACHED", false);
|
||||
update_env(_env, "ROCPROFSYS_PROFILE", false);
|
||||
update_env(_env, "ROCPROFSYS_USE_PROCESS_SAMPLING", false);
|
||||
update_env(_env, "ROCPROFSYS_TRACE", false);
|
||||
update_env(_env, "ROCPROFSYS_TRACE_LEGACY", false);
|
||||
update_env(_env, "ROCPROFSYS_THREAD_POOL_SIZE",
|
||||
get_env<int>("ROCPROFSYS_THREAD_POOL_SIZE", 0));
|
||||
update_env(_env, "ROCPROFSYS_LAUNCHER", "rocprof-sys-causal");
|
||||
|
||||
@@ -339,20 +339,23 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
original_envs);
|
||||
});
|
||||
parser
|
||||
.add_argument({ "-T", "--trace" }, "Generate a detailed trace (perfetto output)")
|
||||
.add_argument(
|
||||
{ "-T", "--trace" },
|
||||
"Generate a detailed trace with deferred trace generation (perfetto output)")
|
||||
.max_count(1)
|
||||
.action([&](parser_t& p) {
|
||||
rocprofsys::common::update_env(_env, "ROCPROFSYS_TRACE", p.get<bool>("trace"),
|
||||
update_mode::REPLACE, ":", updated_envs,
|
||||
original_envs);
|
||||
rocprofsys::common::update_env(_env, "ROCPROFSYS_TRACE_CACHED",
|
||||
p.get<bool>("trace"), update_mode::REPLACE,
|
||||
":", updated_envs, original_envs);
|
||||
});
|
||||
parser
|
||||
.add_argument({ "--trace-cached" },
|
||||
"Generate a detailed trace (perfetto output) from cached data ")
|
||||
.add_argument(
|
||||
{ "-L", "--trace-legacy" },
|
||||
"Generate a detailed trace with direct mode (perfetto output, legacy)")
|
||||
.max_count(1)
|
||||
.action([&](parser_t& p) {
|
||||
rocprofsys::common::update_env(
|
||||
_env, "ROCPROFSYS_TRACE_CACHED", p.get<bool>("trace-cached"),
|
||||
_env, "ROCPROFSYS_TRACE_LEGACY", p.get<bool>("trace-legacy"),
|
||||
update_mode::REPLACE, ":", updated_envs, original_envs);
|
||||
});
|
||||
parser
|
||||
|
||||
@@ -301,14 +301,24 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
if(_data.environ_filter("trace", _data))
|
||||
{
|
||||
_parser
|
||||
.add_argument({ "-T", "--trace" },
|
||||
"Generate a detailed trace (perfetto output)")
|
||||
.add_argument({ "-T", "--trace" }, "Generate a detailed trace with deferred "
|
||||
"trace generation (perfetto output)")
|
||||
.max_count(1)
|
||||
.action([&](parser_t& p) {
|
||||
update_env(_data, "ROCPROFSYS_TRACE", p.get<bool>("trace"));
|
||||
update_env(_data, "ROCPROFSYS_TRACE_CACHED", p.get<bool>("trace"));
|
||||
});
|
||||
|
||||
_parser
|
||||
.add_argument(
|
||||
{ "-L", "--trace-legacy" },
|
||||
"Generate a detailed trace with direct mode (perfetto output, legacy)")
|
||||
.max_count(1)
|
||||
.action([&](parser_t& p) {
|
||||
update_env(_data, "ROCPROFSYS_TRACE_LEGACY", p.get<bool>("trace-legacy"));
|
||||
});
|
||||
|
||||
_data.processed_environs.emplace("trace");
|
||||
_data.processed_environs.emplace("trace_legacy");
|
||||
}
|
||||
|
||||
if(_data.environ_filter("profile", _data))
|
||||
|
||||
@@ -299,20 +299,32 @@ configure_settings(bool _init)
|
||||
get_env<size_t>("ROCPROFSYS_NUM_THREADS", 1), "threading", "performance",
|
||||
"sampling", "parallelism", "advanced");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE", "Enable perfetto backend",
|
||||
_default_perfetto_v, "backend", "perfetto");
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_CACHED",
|
||||
"Enable perfetto backend with deferred trace generation "
|
||||
"for minimal runtime overhead",
|
||||
_default_perfetto_v, "backend", "perfetto_caching");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_LEGACY",
|
||||
"Enable perfetto backend (legacy, direct mode)", false,
|
||||
"backend", "perfetto");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE",
|
||||
"[DEPRECATED] Renamed to ROCPROFSYS_TRACE_LEGACY", false,
|
||||
"backend", "perfetto", "deprecated");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_PERFETTO",
|
||||
"[DEPRECATED] Renamed to ROCPROFSYS_TRACE",
|
||||
_default_perfetto_v, "backend", "perfetto", "deprecated");
|
||||
"[DEPRECATED] Renamed to ROCPROFSYS_TRACE_LEGACY", false,
|
||||
"backend", "perfetto", "deprecated");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_PROFILE", "Enable timemory backend",
|
||||
!_config->get<bool>("ROCPROFSYS_TRACE"), "backend",
|
||||
"timemory");
|
||||
!(_config->get<bool>("ROCPROFSYS_TRACE_LEGACY") ||
|
||||
_config->get<bool>("ROCPROFSYS_TRACE_CACHED")),
|
||||
"backend", "timemory");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
bool, "ROCPROFSYS_USE_TIMEMORY", "[DEPRECATED] Renamed to ROCPROFSYS_PROFILE",
|
||||
!_config->get<bool>("ROCPROFSYS_TRACE"), "backend", "timemory", "deprecated");
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_TIMEMORY",
|
||||
"[DEPRECATED] Renamed to ROCPROFSYS_PROFILE",
|
||||
!_config->get<bool>("ROCPROFSYS_TRACE_LEGACY"), "backend",
|
||||
"timemory", "deprecated");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_CAUSAL",
|
||||
"Enable causal profiling analysis", false, "backend",
|
||||
@@ -321,10 +333,6 @@ configure_settings(bool _init)
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCPD", "Enable rocpd backend", false,
|
||||
"backend", "rocpd");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_CACHED",
|
||||
"Enable perfetto with trace cache", false, "backend",
|
||||
"perfetto_caching");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCM",
|
||||
"Enable ROCm API and kernel tracing", true, "backend",
|
||||
"rocm");
|
||||
@@ -1060,7 +1068,8 @@ configure_settings(bool _init)
|
||||
handle_deprecated_setting("ROCPROFSYS_USE_THREAD_SAMPLING",
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING");
|
||||
handle_deprecated_setting("ROCPROFSYS_OUTPUT_FILE", "ROCPROFSYS_PERFETTO_FILE");
|
||||
handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE");
|
||||
handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE_LEGACY");
|
||||
handle_deprecated_setting("ROCPROFSYS_TRACE", "ROCPROFSYS_TRACE_LEGACY");
|
||||
handle_deprecated_setting("ROCPROFSYS_USE_TIMEMORY", "ROCPROFSYS_PROFILE");
|
||||
|
||||
scope::get_fields()[scope::flat::value] = _config->get_flat_profile();
|
||||
@@ -1127,7 +1136,8 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
|
||||
if(get_mode() == Mode::Coverage)
|
||||
{
|
||||
set_default_setting_value("ROCPROFSYS_USE_CODE_COVERAGE", true);
|
||||
_set("ROCPROFSYS_TRACE", false);
|
||||
_set("ROCPROFSYS_TRACE_LEGACY", false);
|
||||
_set("ROCPROFSYS_TRACE_CACHED", false);
|
||||
_set("ROCPROFSYS_PROFILE", false);
|
||||
_set("ROCPROFSYS_USE_CAUSAL", false);
|
||||
_set("ROCPROFSYS_USE_AMD_SMI", false);
|
||||
@@ -1140,7 +1150,8 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
|
||||
else if(get_mode() == Mode::Causal)
|
||||
{
|
||||
_set("ROCPROFSYS_USE_CAUSAL", true);
|
||||
_set("ROCPROFSYS_TRACE", false);
|
||||
_set("ROCPROFSYS_TRACE_LEGACY", false);
|
||||
_set("ROCPROFSYS_TRACE_CACHED", false);
|
||||
_set("ROCPROFSYS_PROFILE", false);
|
||||
_set("ROCPROFSYS_USE_SAMPLING", false);
|
||||
_set("ROCPROFSYS_USE_PROCESS_SAMPLING", false);
|
||||
@@ -1836,7 +1847,7 @@ get_verbose()
|
||||
bool&
|
||||
get_use_perfetto()
|
||||
{
|
||||
static auto _v = get_config()->at("ROCPROFSYS_TRACE");
|
||||
static auto _v = get_config()->at("ROCPROFSYS_TRACE_LEGACY");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v).get();
|
||||
}
|
||||
|
||||
@@ -2144,11 +2155,12 @@ get_perfetto_backend()
|
||||
std::string
|
||||
get_perfetto_output_filename()
|
||||
{
|
||||
static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_FILE");
|
||||
auto _val = static_cast<tim::tsettings<std::string>&>(*_v->second).get();
|
||||
auto _pos_dir = _val.find_last_of('/');
|
||||
auto _dir = std::string{};
|
||||
auto _ext = std::string{ "proto" };
|
||||
static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_FILE");
|
||||
auto _val = static_cast<tim::tsettings<std::string>&>(*_v->second).get();
|
||||
|
||||
auto _pos_dir = _val.find_last_of('/');
|
||||
auto _dir = std::string{};
|
||||
auto _ext = std::string{ "proto" };
|
||||
if(_pos_dir != std::string::npos)
|
||||
{
|
||||
_dir = _val.substr(0, _pos_dir + 1);
|
||||
@@ -2161,12 +2173,38 @@ get_perfetto_output_filename()
|
||||
_val = _val.substr(0, _pos_ext);
|
||||
}
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "[get_perfetto_output_filename] Parsed: dir='%s', basename='%s', ext='%s'\n",
|
||||
_dir.c_str(), _val.c_str(), _ext.c_str());
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "[get_perfetto_output_filename] settings::output_path()='%s'\n",
|
||||
settings::output_path().c_str());
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "[get_perfetto_output_filename] settings::output_prefix()='%s'\n",
|
||||
settings::output_prefix().c_str());
|
||||
|
||||
auto _cfg = settings::compose_filename_config{ settings::use_output_suffix(),
|
||||
settings::default_process_suffix(),
|
||||
false, _dir };
|
||||
_val = settings::compose_output_filename(_val, _ext, _cfg);
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "[get_perfetto_output_filename] After compose_output_filename: '%s'\n",
|
||||
_val.c_str());
|
||||
|
||||
if(!_val.empty() && _val.at(0) != '/')
|
||||
return settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag());
|
||||
{
|
||||
auto _result =
|
||||
settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag());
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "[get_perfetto_output_filename] Path is relative, prepending PWD: '%s'\n",
|
||||
_result.c_str());
|
||||
return _result;
|
||||
}
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "[get_perfetto_output_filename] Path is absolute, returning: '%s'\n",
|
||||
_val.c_str());
|
||||
return _val;
|
||||
}
|
||||
|
||||
@@ -2426,8 +2464,20 @@ get_perfetto_output_filename_with_suffix(std::string_view suffix)
|
||||
static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_FILE");
|
||||
auto _val = static_cast<tim::tsettings<std::string>&>(*_v->second).get();
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2,
|
||||
"[get_perfetto_output_filename_with_suffix] Initial "
|
||||
"ROCPROFSYS_PERFETTO_FILE='%s', suffix='%s'\n",
|
||||
_val.c_str(), std::string{ suffix }.c_str());
|
||||
|
||||
// If absolute path is provided, return it as-is
|
||||
if(!_val.empty() && _val.at(0) == '/') return _val;
|
||||
if(!_val.empty() && _val.at(0) == '/')
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2,
|
||||
"[get_perfetto_output_filename_with_suffix] Absolute path, returning: '%s'\n",
|
||||
_val.c_str());
|
||||
return _val;
|
||||
}
|
||||
|
||||
auto _pos_dir = _val.find_last_of('/');
|
||||
auto _dir = std::string{};
|
||||
@@ -2451,6 +2501,15 @@ get_perfetto_output_filename_with_suffix(std::string_view suffix)
|
||||
bool _explicitly_set =
|
||||
(_v->second->get_environ_updated() || _v->second->get_config_updated());
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2,
|
||||
"[get_perfetto_output_filename_with_suffix] Parsed: dir='%s', basename='%s', "
|
||||
"ext='%s', explicitly_set=%s\n",
|
||||
_dir.c_str(), _val.c_str(), _ext.c_str(), _explicitly_set ? "true" : "false");
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "[get_perfetto_output_filename_with_suffix] settings::output_path()='%s'\n",
|
||||
settings::output_path().c_str());
|
||||
|
||||
auto _cfg = settings::compose_filename_config{
|
||||
!_explicitly_set && !suffix.empty(), // use_suffix only if not explicitly set
|
||||
suffix, // suffix value
|
||||
@@ -2459,9 +2518,27 @@ get_perfetto_output_filename_with_suffix(std::string_view suffix)
|
||||
};
|
||||
|
||||
_val = settings::compose_output_filename(_val, _ext, _cfg);
|
||||
if(!_val.empty() && _val.at(0) != '/')
|
||||
return settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag());
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2,
|
||||
"[get_perfetto_output_filename_with_suffix] After "
|
||||
"compose_output_filename: '%s'\n",
|
||||
_val.c_str());
|
||||
|
||||
if(!_val.empty() && _val.at(0) != '/')
|
||||
{
|
||||
auto _result =
|
||||
settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag());
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(2,
|
||||
"[get_perfetto_output_filename_with_suffix] Path is "
|
||||
"relative, prepending PWD: '%s'\n",
|
||||
_result.c_str());
|
||||
return _result;
|
||||
}
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2,
|
||||
"[get_perfetto_output_filename_with_suffix] Path is absolute, returning: '%s'\n",
|
||||
_val.c_str());
|
||||
return _val;
|
||||
}
|
||||
|
||||
|
||||
@@ -243,6 +243,7 @@ post_process(tim::manager* _timemory_manager, bool& _perfetto_output_error)
|
||||
#endif
|
||||
|
||||
auto _filename = config::get_perfetto_output_filename();
|
||||
|
||||
if(!trace_data.empty())
|
||||
{
|
||||
operation::file_output_message<tim::project::rocprofsys> _fom{};
|
||||
|
||||
+44
-63
@@ -668,9 +668,40 @@ perfetto_processor_t::handle(const region_sample& _rs)
|
||||
annotate_perfetto(ctx, annotations);
|
||||
};
|
||||
|
||||
tracing::push_perfetto_ts(category::rocm{}, _name.c_str(), _beg_ts,
|
||||
::perfetto::Flow::ProcessScoped(_corr_id), add_annotations);
|
||||
tracing::pop_perfetto_ts(category::rocm{}, _name.c_str(), _end_ts);
|
||||
auto emit_trace = [&](auto category_tag) {
|
||||
using CategoryT = decltype(category_tag);
|
||||
tracing::push_perfetto_ts(CategoryT{}, _name.c_str(), _beg_ts,
|
||||
::perfetto::Flow::ProcessScoped(_corr_id),
|
||||
add_annotations);
|
||||
tracing::pop_perfetto_ts(CategoryT{}, _name.c_str(), _end_ts);
|
||||
};
|
||||
|
||||
auto try_category = [&](auto category_tag) {
|
||||
using CategoryT = decltype(category_tag);
|
||||
if(_category == trait::name<CategoryT>::value)
|
||||
{
|
||||
emit_trace(category_tag);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
bool dispatched =
|
||||
(try_category(category::host{}) || try_category(category::user{}) ||
|
||||
try_category(category::python{}) || try_category(category::mpi{}) ||
|
||||
try_category(category::pthread{}) || try_category(category::kokkos{}) ||
|
||||
try_category(category::rocm_hip_api{}) ||
|
||||
try_category(category::rocm_hsa_api{}) ||
|
||||
try_category(category::rocm_marker_api{}) ||
|
||||
try_category(category::rocm_rccl{}) ||
|
||||
try_category(category::rocm_rocdecode_api{}) ||
|
||||
try_category(category::rocm_rocjpeg_api{}) || try_category(category::vaapi{}));
|
||||
|
||||
if(!dispatched)
|
||||
{
|
||||
// Default to rocm category for backward compatibility
|
||||
emit_trace(category::rocm{});
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -894,32 +925,6 @@ perfetto_processor_t::handle([[maybe_unused]] const pmc_event_with_sample& _pmc)
|
||||
void
|
||||
perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi)
|
||||
{
|
||||
// using amd_smi_gfx_track = perfetto_counter_track<category::amd_smi_gfx_busy>;
|
||||
// using amd_smi_umc_track = perfetto_counter_track<category::amd_smi_umc_busy>;
|
||||
// using amd_smi_mm_track = perfetto_counter_track<category::amd_smi_mm_busy>;
|
||||
// using amd_smi_temp_track = perfetto_counter_track<category::amd_smi_temp>;
|
||||
// using amd_smi_power_track = perfetto_counter_track<category::amd_smi_power>;
|
||||
// using amd_smi_mem_track = perfetto_counter_track<category::amd_smi_memory_usage>;
|
||||
// using amd_smi_vcn_track = perfetto_counter_track<category::amd_smi_vcn_activity>;
|
||||
// using amd_smi_jpeg_track =
|
||||
// perfetto_counter_track<category::amd_smi_jpeg_activity>; using
|
||||
// amd_smi_xgmi_link_width_track =
|
||||
// perfetto_counter_track<category::amd_smi_xgmi_link_width>;
|
||||
// using amd_smi_xgmi_link_speed_track =
|
||||
// perfetto_counter_track<category::amd_smi_xgmi_link_speed>;
|
||||
// using amd_smi_xgmi_read_track =
|
||||
// perfetto_counter_track<category::amd_smi_xgmi_read_data>;
|
||||
// using amd_smi_xgmi_write_track =
|
||||
// perfetto_counter_track<category::amd_smi_xgmi_write_data>;
|
||||
// using amd_smi_pcie_link_width_track =
|
||||
// perfetto_counter_track<category::amd_smi_pcie_link_width>;
|
||||
// using amd_smi_pcie_link_speed_track =
|
||||
// perfetto_counter_track<category::amd_smi_pcie_link_speed>;
|
||||
// using amd_smi_pcie_bandwidth_acc_track =
|
||||
// perfetto_counter_track<category::amd_smi_pcie_bandwidth_acc>;
|
||||
// using amd_smi_pcie_bandwidth_inst_track =
|
||||
// perfetto_counter_track<category::amd_smi_pcie_bandwidth_inst>;
|
||||
|
||||
// Use the shared gpu_metrics_t from core/gpu_metrics.hpp
|
||||
using gpu_metrics_t = gpu::gpu_metrics_t;
|
||||
|
||||
@@ -937,36 +942,6 @@ perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi)
|
||||
auto _ts = _amd_smi.timestamp;
|
||||
auto _device_id = _amd_smi.device_id;
|
||||
|
||||
// auto setup_tracks = [&]() {
|
||||
// if(amd_smi_gfx_track::exists(_device_id)) return;
|
||||
|
||||
// auto make_track_name = [&](const char* metric) {
|
||||
// return JOIN(" ", "GPU", JOIN("", '[', _device_id, ']'), metric, "(S)");
|
||||
// };
|
||||
|
||||
// if(is_busy_enabled)
|
||||
// {
|
||||
// amd_smi_gfx_track::emplace(_device_id, make_track_name("GFX Busy"), "%");
|
||||
// amd_smi_umc_track::emplace(_device_id, make_track_name("UMC Busy"), "%");
|
||||
// amd_smi_mm_track::emplace(_device_id, make_track_name("MM Busy"), "%");
|
||||
// }
|
||||
// if(is_temp_enabled)
|
||||
// {
|
||||
// amd_smi_temp_track::emplace(_device_id, make_track_name("Temperature"),
|
||||
// "deg C");
|
||||
// }
|
||||
// if(is_power_enabled)
|
||||
// {
|
||||
// amd_smi_power_track::emplace(_device_id, make_track_name("Power"), "W");
|
||||
// }
|
||||
// if(is_mem_usage_enabled)
|
||||
// {
|
||||
// amd_smi_mem_track::emplace(_device_id, make_track_name("Memory Usage"),
|
||||
// "MB");
|
||||
// }
|
||||
// };
|
||||
|
||||
// setup_tracks();
|
||||
setup_amd_smi_tracks(_device_id, is_busy_enabled, is_temp_enabled, is_power_enabled,
|
||||
is_mem_usage_enabled);
|
||||
|
||||
@@ -1013,6 +988,14 @@ perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi)
|
||||
|
||||
using Category = std::decay_t<decltype(category)>;
|
||||
|
||||
const char* metric_name = nullptr;
|
||||
if constexpr(std::is_same_v<Category, category::amd_smi_vcn_activity>)
|
||||
metric_name = "VCN Activity";
|
||||
else if constexpr(std::is_same_v<Category, category::amd_smi_jpeg_activity>)
|
||||
metric_name = "JPEG Activity";
|
||||
else
|
||||
metric_name = trait::name<Category>::value;
|
||||
|
||||
for(size_t i = 0; i < data.size(); ++i)
|
||||
{
|
||||
const auto value = data[i];
|
||||
@@ -1023,16 +1006,14 @@ perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi)
|
||||
{
|
||||
// Per-XCP format
|
||||
track_name = JOIN(
|
||||
" ", "GPU", JOIN("", '[', _device_id, ']'),
|
||||
trait::name<Category>::value,
|
||||
" ", "GPU", JOIN("", '[', _device_id, ']'), metric_name,
|
||||
JOIN("", "XCP_", _idx.value(), ": [", (i < 10 ? "0" : ""), i, ']'),
|
||||
"(S)");
|
||||
}
|
||||
else
|
||||
{
|
||||
// Device-level format
|
||||
track_name = JOIN(" ", "GPU", JOIN("", '[', _device_id, ']'),
|
||||
trait::name<Category>::value,
|
||||
track_name = JOIN(" ", "GPU", JOIN("", '[', _device_id, ']'), metric_name,
|
||||
JOIN("", "[", (i < 10 ? "0" : ""), i, ']'), "(S)");
|
||||
}
|
||||
|
||||
|
||||
@@ -103,6 +103,7 @@ extern "C"
|
||||
void rocprofsys_set_mpi_hidden(bool, bool) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_push_trace_hidden(const char*) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_pop_trace_hidden(const char*) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_flush_pending_region_cache_hidden() ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_push_region_hidden(const char*) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_pop_region_hidden(const char*) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_push_category_region_hidden(rocprofsys_category_t, const char*,
|
||||
|
||||
@@ -899,11 +899,12 @@ rocprofsys_finalize_hidden(void)
|
||||
#endif
|
||||
|
||||
ROCPROFSYS_DEBUG_F("Stopping and destroying instrumentation bundles...\n");
|
||||
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
|
||||
auto* _bundles = instrumentation_bundles::get();
|
||||
for(size_t i = 0; _bundles && i < thread_info::get_peak_num_threads(); ++i)
|
||||
{
|
||||
if(!instrumentation_bundles::get()) continue;
|
||||
if(i >= _bundles->size()) continue;
|
||||
const auto& _info = thread_info::get(i, SequentTID);
|
||||
auto& itr = instrumentation_bundles::get()->at(i);
|
||||
auto& itr = _bundles->at(i);
|
||||
while(itr != nullptr && !itr->empty())
|
||||
{
|
||||
int _lvl = 1;
|
||||
@@ -1026,6 +1027,11 @@ rocprofsys_finalize_hidden(void)
|
||||
|
||||
tracing::copy_timemory_hash_ids();
|
||||
|
||||
// Flush any pending region cache entries (e.g., main entry point that wasn't
|
||||
// explicitly stopped before finalization)
|
||||
ROCPROFSYS_DEBUG_F("Flushing pending region cache entries...\n");
|
||||
rocprofsys_flush_pending_region_cache_hidden();
|
||||
|
||||
bool _perfetto_output_error = false;
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
|
||||
+25
@@ -120,6 +120,31 @@ cache_stop(const char* name)
|
||||
rocprofsys::trait::name<CategoryT>::value);
|
||||
}
|
||||
}
|
||||
|
||||
/// Flush all pending cached entries for this thread.
|
||||
/// Called during finalization to ensure entries that were started but not stopped
|
||||
/// (e.g., main entry point) are written to the trace cache.
|
||||
inline void
|
||||
flush_pending_cached_entries()
|
||||
{
|
||||
const auto end_ts = static_cast<timestamp_t>(rocprofsys::comp::wall_clock::record());
|
||||
uint64_t thread_id = 0;
|
||||
|
||||
const auto& extended_info = rocprofsys::thread_info::get(std::this_thread::get_id());
|
||||
if(extended_info.has_value() && extended_info->index_data.has_value())
|
||||
{
|
||||
constexpr size_t UNKNOWN_TIME = 0;
|
||||
thread_id = extended_info->index_data->system_value;
|
||||
rocprofsys::trace_cache::get_metadata_registry().add_thread_info(
|
||||
{ getppid(), getpid(), thread_id, UNKNOWN_TIME, UNKNOWN_TIME, "{}" });
|
||||
}
|
||||
|
||||
for(const auto& [key, start_ts] : map_name_to_args)
|
||||
{
|
||||
cache_region(thread_id, key.name, start_ts, end_ts, key.category);
|
||||
}
|
||||
map_name_to_args.clear();
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace tim
|
||||
|
||||
+15
-5
@@ -533,12 +533,22 @@ void
|
||||
cache_region(const rocprofiler_callback_tracing_record_t* record,
|
||||
const rocprofiler_timestamp_t start_timestamp,
|
||||
const rocprofiler_timestamp_t end_timestamp, const std::string& call_stack,
|
||||
const std::string& args_str, const std::string& category)
|
||||
const std::string& args_str, const std::string& category,
|
||||
std::string_view name = {})
|
||||
|
||||
{
|
||||
auto callback_tracing_info =
|
||||
trace_cache::get_metadata_registry().get_callback_tracing_info();
|
||||
auto _name = std::string{ callback_tracing_info.at(record->kind, record->operation) };
|
||||
// Use provided name if available, otherwise fall back to API operation name
|
||||
std::string _name;
|
||||
if(name.empty())
|
||||
{
|
||||
auto callback_tracing_info =
|
||||
trace_cache::get_metadata_registry().get_callback_tracing_info();
|
||||
_name = std::string{ callback_tracing_info.at(record->kind, record->operation) };
|
||||
}
|
||||
else
|
||||
{
|
||||
_name = std::string{ name };
|
||||
}
|
||||
|
||||
trace_cache::get_buffer_storage().store(trace_cache::region_sample{
|
||||
record->thread_id, _name.c_str(), record->correlation_id.internal,
|
||||
@@ -814,7 +824,7 @@ tool_tracing_callback_stop(
|
||||
cache_add_thread_info(record.thread_id);
|
||||
std::string args_str = get_args_string(args);
|
||||
cache_region(&record, _beg_ts, _end_ts, call_stack.dump(), args_str,
|
||||
trait::name<CategoryT>::value);
|
||||
trait::name<CategoryT>::value, _name);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "core/state.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "core/utility.hpp"
|
||||
#include "library/thread_data_growth.hpp"
|
||||
#include "library/thread_deleter.hpp"
|
||||
|
||||
#include <timemory/utility/macros.hpp>
|
||||
@@ -54,15 +55,6 @@ using instrumentation_bundle_t =
|
||||
// allocator for instrumentation_bundle_t
|
||||
using bundle_allocator_t = tim::data::ring_buffer_allocator<instrumentation_bundle_t>;
|
||||
|
||||
using grow_functor_t = int64_t (*)(int64_t);
|
||||
|
||||
inline auto&
|
||||
grow_functors()
|
||||
{
|
||||
static auto _v = container::stable_vector<grow_functor_t>{};
|
||||
return _v;
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
struct base_thread_data
|
||||
{
|
||||
@@ -77,7 +69,16 @@ struct base_thread_data
|
||||
}
|
||||
return (_v) ? _v->capacity() : 0;
|
||||
};
|
||||
grow_functors().emplace_back(std::move(_func));
|
||||
grow_functors().emplace_back(_func);
|
||||
|
||||
// Immediately sync this container to current peak_num_threads.
|
||||
// This ensures containers instantiated after threads exceed
|
||||
// max_supported_threads are properly sized.
|
||||
auto _current_peak = get_current_peak_num_threads();
|
||||
if(_current_peak > static_cast<int64_t>(max_supported_threads))
|
||||
{
|
||||
_func(_current_peak);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
+63
@@ -0,0 +1,63 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/concepts.hpp"
|
||||
#include "core/containers/stable_vector.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
using grow_functor_t = int64_t (*)(int64_t);
|
||||
|
||||
inline auto&
|
||||
grow_functors()
|
||||
{
|
||||
static auto _v = container::stable_vector<grow_functor_t>{};
|
||||
return _v;
|
||||
}
|
||||
|
||||
inline auto&
|
||||
get_peak_num_threads_callback()
|
||||
{
|
||||
static std::function<int64_t()> _v = []() -> int64_t {
|
||||
return static_cast<int64_t>(max_supported_threads);
|
||||
};
|
||||
return _v;
|
||||
}
|
||||
|
||||
inline int64_t
|
||||
get_current_peak_num_threads()
|
||||
{
|
||||
return get_peak_num_threads_callback()();
|
||||
}
|
||||
|
||||
inline void
|
||||
set_peak_num_threads_callback(std::function<int64_t()> _cb)
|
||||
{
|
||||
get_peak_num_threads_callback() = std::move(_cb);
|
||||
}
|
||||
|
||||
} // namespace rocprofsys
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "library/causal/delay.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/thread_data_growth.hpp"
|
||||
|
||||
#include <timemory/backends/threading.hpp>
|
||||
#include <timemory/components/timing/backends.hpp>
|
||||
@@ -111,6 +112,13 @@ init_index_data(int64_t _tid, bool _offset = false)
|
||||
thread_local int64_t offset_causal_count = 0;
|
||||
const auto unknown_thread = std::optional<thread_info>{};
|
||||
int64_t peak_num_threads = max_supported_threads;
|
||||
|
||||
// Register callback to allow thread_data containers to query peak_num_threads
|
||||
// when they are instantiated, ensuring late-instantiated containers are properly sized.
|
||||
const auto peak_num_threads_callback_registered = []() {
|
||||
set_peak_num_threads_callback([]() -> int64_t { return peak_num_threads; });
|
||||
return true;
|
||||
}();
|
||||
} // namespace
|
||||
|
||||
std::string
|
||||
|
||||
@@ -124,6 +124,12 @@ rocprofsys_pop_trace_hidden(const char* name)
|
||||
rocprofsys::component::category_region<rocprofsys::category::host>::stop(name);
|
||||
}
|
||||
|
||||
extern "C" void
|
||||
rocprofsys_flush_pending_region_cache_hidden()
|
||||
{
|
||||
flush_pending_cached_entries();
|
||||
}
|
||||
|
||||
//======================================================================================//
|
||||
///
|
||||
///
|
||||
|
||||
+9
-9
@@ -60,24 +60,24 @@
|
||||
},
|
||||
{
|
||||
"comparison": "equals",
|
||||
"description": "Verify that 'roctxMarkA' appears at 5 times in table 'regions'",
|
||||
"error_message": "Expected 5 'roctxMarkA' entries in `regions` table",
|
||||
"description": "Verify that roctxMark markers appear 5 times in table 'regions'",
|
||||
"error_message": "Expected 5 roctxMark marker entries in `regions` table",
|
||||
"expected_result": 5,
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name = 'roctxMarkA';"
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name LIKE 'roctxMark_%';"
|
||||
},
|
||||
{
|
||||
"comparison": "equals",
|
||||
"description": "Verify that 'roctxRangePop' appears at 3 times in table 'regions'",
|
||||
"error_message": "Expected 3 'roctxRangePop' entries in `regions` table",
|
||||
"description": "Verify that roctxRangePush markers appear 3 times in table 'regions'",
|
||||
"error_message": "Expected 3 roctxRangePush marker entries in `regions` table",
|
||||
"expected_result": 3,
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name = 'roctxRangePop';"
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name LIKE 'roctxRangePush_%';"
|
||||
},
|
||||
{
|
||||
"comparison": "equals",
|
||||
"description": "Verify that 'roctxRangeStop' appears at 2 times in table 'regions'",
|
||||
"error_message": "Expected 2 'roctxRangeStop' entries in `regions` table",
|
||||
"description": "Verify that roctxRangeStart markers appear 2 times in table 'regions'",
|
||||
"error_message": "Expected 2 roctxRangeStart marker entries in `regions` table",
|
||||
"expected_result": 2,
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name = 'roctxRangeStop';"
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name LIKE 'roctxRangeStart_%';"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@@ -36,6 +36,8 @@ if(
|
||||
)
|
||||
set(_annotate_environment
|
||||
"${_base_environment}"
|
||||
"ROCPROFSYS_TRACE_CACHED=OFF"
|
||||
"ROCPROFSYS_TRACE_LEGACY=ON"
|
||||
"ROCPROFSYS_TIMEMORY_COMPONENTS=thread_cpu_clock papi_array"
|
||||
"ROCPROFSYS_PAPI_EVENTS=perf::PERF_COUNT_SW_CPU_CLOCK"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
@@ -82,6 +84,8 @@ if(
|
||||
else()
|
||||
set(_annotate_environment
|
||||
"${_base_environment}"
|
||||
"ROCPROFSYS_TRACE_CACHED=OFF"
|
||||
"ROCPROFSYS_TRACE_LEGACY=ON"
|
||||
"ROCPROFSYS_TIMEMORY_COMPONENTS=thread_cpu_clock"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
)
|
||||
|
||||
@@ -233,7 +233,7 @@ rocprofiler_systems_add_bin_test(
|
||||
TIMEOUT 45
|
||||
PASS_REGEX
|
||||
"ENVIRONMENT VARIABLE,[ \n]+ROCPROFSYS_CI_SKIP_PUSH_POP_CHECK,[ \n]+ROCPROFSYS_THREAD_POOL_SIZE,[ \n]+ROCPROFSYS_USE_PID,[ \n]+"
|
||||
FAIL_REGEX "ROCPROFSYS_TRACE|ROCPROFSYS_ABORT_FAIL_REGEX"
|
||||
FAIL_REGEX "ROCPROFSYS_TRACE_LEGACY|ROCPROFSYS_TRACE_CACHED|ROCPROFSYS_ABORT_FAIL_REGEX"
|
||||
)
|
||||
|
||||
string(
|
||||
@@ -270,7 +270,7 @@ rocprofiler_systems_add_bin_test(
|
||||
txt json xml --force
|
||||
TIMEOUT 45
|
||||
LABELS "rocprofiler-systems-avail"
|
||||
ENVIRONMENT "ROCPROFSYS_TRACE=OFF;ROCPROFSYS_PROFILE=ON"
|
||||
ENVIRONMENT "ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_TRACE_CACHED=OFF;ROCPROFSYS_PROFILE=ON"
|
||||
PASS_REGEX
|
||||
"Outputting JSON configuration file '${_AVAIL_CFG_PATH}tweak\\\.json'(.*)Outputting XML configuration file '${_AVAIL_CFG_PATH}tweak\\\.xml'(.*)Outputting text configuration file '${_AVAIL_CFG_PATH}tweak\\\.cfg'(.*)"
|
||||
)
|
||||
|
||||
@@ -26,7 +26,10 @@
|
||||
#
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
# Use legacy trace mode for AMD SMI counters - cached mode doesn't support real-time counter tracking
|
||||
set(_gpu_connect_environment
|
||||
"ROCPROFSYS_TRACE_CACHED=OFF"
|
||||
"ROCPROFSYS_TRACE_LEGACY=ON"
|
||||
"ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api"
|
||||
"ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,xgmi,pcie"
|
||||
"ROCPROFSYS_SAMPLING_CPUS=none"
|
||||
|
||||
@@ -74,5 +74,5 @@ rocprofiler_systems_add_test(
|
||||
REWRITE_ARGS -e -v 2 --min-instructions=8
|
||||
RUN_ARGS 10 4 1000
|
||||
ENVIRONMENT
|
||||
"${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=OFF;ROCPROFSYS_TRACE=ON;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF"
|
||||
"${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=OFF;ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_TRACE_CACHED=ON;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF"
|
||||
)
|
||||
|
||||
@@ -51,9 +51,11 @@ rocprofiler_systems_add_test(
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')"
|
||||
REWRITE_RUN_FAIL_REGEX
|
||||
"(perfetto-trace|trip_count|sampling_percent|sampling_cpu_clock|sampling_wall_clock|wall_clock)-[0-9][0-9]+.(json|txt|proto)|ROCPROFSYS_ABORT_FAIL_REGEX"
|
||||
"Outputting.*(perfetto-trace|trip_count|sampling_percent|sampling_cpu_clock|sampling_wall_clock|wall_clock)-[0-9][0-9]+.(json|txt|proto)|ROCPROFSYS_ABORT_FAIL_REGEX"
|
||||
)
|
||||
|
||||
# mpi-perfetto-merge requires legacy trace mode because MPI trace combining
|
||||
# uses MPI communication (mpi_get) which is only implemented in the legacy path
|
||||
rocprofiler_systems_add_test(
|
||||
SKIP_RUNTIME
|
||||
NAME "mpi-perfetto-merge"
|
||||
@@ -70,7 +72,8 @@ rocprofiler_systems_add_test(
|
||||
line
|
||||
--min-instructions
|
||||
0
|
||||
ENVIRONMENT "${_base_environment};ROCPROFSYS_VERBOSE=1"
|
||||
ENVIRONMENT
|
||||
"${_base_environment};ROCPROFSYS_VERBOSE=1;ROCPROFSYS_TRACE_CACHED=OFF;ROCPROFSYS_TRACE_LEGACY=ON;ROCPROFSYS_PERFETTO_COMBINE_TRACES=ON"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"Successfully executed: .+rocprof-sys-merge-output.sh.*"
|
||||
REWRITE_RUN_FAIL_REGEX
|
||||
@@ -123,7 +126,8 @@ rocprofiler_systems_add_test(
|
||||
)
|
||||
|
||||
set(_mpip_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=OFF"
|
||||
@@ -138,7 +142,8 @@ set(_mpip_environment
|
||||
)
|
||||
|
||||
set(_mpip_all2all_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=OFF"
|
||||
|
||||
@@ -25,8 +25,11 @@ execute_process(
|
||||
|
||||
message(STATUS "The list of all PAPI network events is ${_event_list}")
|
||||
|
||||
# Use legacy trace mode for network stats - cached mode doesn't support real-time counter tracking
|
||||
set(_nic_perf_environment
|
||||
"${_base_environment}"
|
||||
"ROCPROFSYS_TRACE_CACHED=OFF"
|
||||
"ROCPROFSYS_TRACE_LEGACY=ON"
|
||||
"ROCPROFSYS_OUTPUT_PATH=${PROJECT_BINARY_DIR}/rocprof-sys-tests-output/nic-performance"
|
||||
"ROCPROFSYS_USE_PID=OFF"
|
||||
"ROCPROFSYS_VERBOSE=1"
|
||||
|
||||
@@ -22,7 +22,8 @@ if(NOT EXISTS "${ROCM_LLVM_LIB_PATH}/libomptarget.so" AND ROCPROFSYS_USE_ROCM)
|
||||
endif()
|
||||
|
||||
set(_ompt_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_TIME_OUTPUT=OFF"
|
||||
"ROCPROFSYS_USE_OMPT=ON"
|
||||
@@ -126,6 +127,7 @@ if(ROCPROFSYS_OMPVV_HOST_TESTS)
|
||||
-e -v 1 --label return args
|
||||
SAMPLING_TIMEOUT 300
|
||||
REWRITE_TIMEOUT 300
|
||||
RUNTIME_TIMEOUT 600
|
||||
ENVIRONMENT
|
||||
"${_ompt_environment};ROCPROFSYS_COUT_OUTPUT=ON;ROCPROFSYS_CI_SKIP_PUSH_POP_CHECK=ON"
|
||||
REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}"
|
||||
|
||||
@@ -34,7 +34,7 @@ rocprofiler_systems_add_test(
|
||||
RUNTIME_ARGS -e -i 256
|
||||
RUN_ARGS 30 4 1000
|
||||
ENVIRONMENT
|
||||
"${_lock_environment};ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE=ON;ROCPROFSYS_COLLAPSE_THREADS=OFF;ROCPROFSYS_SAMPLING_REALTIME=ON;ROCPROFSYS_SAMPLING_REALTIME_FREQ=10;ROCPROFSYS_SAMPLING_REALTIME_TIDS=0;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF"
|
||||
"${_lock_environment};ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_TRACE_CACHED=ON;ROCPROFSYS_COLLAPSE_THREADS=OFF;ROCPROFSYS_SAMPLING_REALTIME=ON;ROCPROFSYS_SAMPLING_REALTIME_FREQ=10;ROCPROFSYS_SAMPLING_REALTIME_TIDS=0;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000"
|
||||
RUNTIME_PASS_REGEX
|
||||
@@ -50,7 +50,7 @@ rocprofiler_systems_add_test(
|
||||
TrampRecursive
|
||||
RUN_ARGS 10 4 1000
|
||||
ENVIRONMENT
|
||||
"${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE=OFF;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF"
|
||||
"${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000"
|
||||
)
|
||||
|
||||
@@ -164,15 +164,28 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
TEST
|
||||
""
|
||||
"NAME;TIMEMORY_METRIC;TIMEMORY_FILE;PERFETTO_FILE"
|
||||
"ARGS;PERFETTO_METRIC;ROCPD_FILE;ROCPD_RULES"
|
||||
"ARGS;TIMEMORY_ARGS;PERFETTO_ARGS;PERFETTO_METRIC;ROCPD_FILE;ROCPD_RULES"
|
||||
${ARGN}
|
||||
)
|
||||
|
||||
# Each validator starts from the shared ARGS and switches to its dedicated
# argument list (TIMEMORY_ARGS / PERFETTO_ARGS) only when one was supplied.
set(_TIMEMORY_VALIDATION_ARGS ${TEST_ARGS})
if(TEST_TIMEMORY_ARGS)
    set(_TIMEMORY_VALIDATION_ARGS ${TEST_TIMEMORY_ARGS})
endif()

set(_PERFETTO_VALIDATION_ARGS ${TEST_ARGS})
if(TEST_PERFETTO_ARGS)
    set(_PERFETTO_VALIDATION_ARGS ${TEST_PERFETTO_ARGS})
endif()
|
||||
|
||||
rocprofiler_systems_add_python_test(
|
||||
NAME ${TEST_NAME}-validate-timemory
|
||||
COMMAND
|
||||
${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-timemory-json.py
|
||||
-m ${TEST_TIMEMORY_METRIC} ${TEST_ARGS} -i
|
||||
-m ${TEST_TIMEMORY_METRIC} ${_TIMEMORY_VALIDATION_ARGS} -i
|
||||
PYTHON_VERSION ${_VERSION}
|
||||
FILE rocprof-sys-tests-output/${TEST_NAME}/${_VERSION}/${TEST_TIMEMORY_FILE}
|
||||
DEPENDS ${TEST_NAME}-${_VERSION}
|
||||
@@ -185,7 +198,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
NAME ${TEST_NAME}-validate-perfetto
|
||||
COMMAND
|
||||
${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-perfetto-proto.py
|
||||
-m ${TEST_PERFETTO_METRIC} ${TEST_ARGS} -p -t
|
||||
-m ${TEST_PERFETTO_METRIC} ${_PERFETTO_VALIDATION_ARGS} -p -t
|
||||
/opt/trace_processor/bin/trace_processor_shell -i
|
||||
PYTHON_VERSION ${_VERSION}
|
||||
FILE rocprof-sys-tests-output/${TEST_NAME}/${_VERSION}/${TEST_PERFETTO_FILE}
|
||||
@@ -217,7 +230,8 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
set(python_source_labels
|
||||
# Timemory validation uses hierarchical output with multiple entries at different depths
|
||||
set(python_source_timemory_labels
|
||||
main_loop
|
||||
run
|
||||
fib
|
||||
@@ -228,7 +242,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
inefficient
|
||||
_sum
|
||||
)
|
||||
set(python_source_count
|
||||
set(python_source_timemory_count
|
||||
5
|
||||
3
|
||||
3
|
||||
@@ -239,7 +253,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
3
|
||||
3
|
||||
)
|
||||
set(python_source_depth
|
||||
set(python_source_timemory_depth
|
||||
0
|
||||
1
|
||||
2
|
||||
@@ -251,6 +265,29 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
3
|
||||
)
|
||||
|
||||
# Perfetto expectations for python-source: cached mode aggregates entries by
# name, so each label is listed once.
# NOTE(review): count/depth entries are presumably matched to labels by position.
set(python_source_perfetto_labels main_loop run fib inefficient _sum)
set(python_source_perfetto_count 5 3 24 3 3)
set(python_source_perfetto_depth 0 1 2 2 3)
|
||||
|
||||
set(python_source_categories python user)
|
||||
|
||||
rocprofiler_systems_add_python_validation_test(
|
||||
@@ -259,14 +296,17 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
TIMEMORY_FILE "trip_count.json"
|
||||
PERFETTO_FILE "perfetto-trace.proto"
|
||||
PERFETTO_METRIC ${python_source_categories}
|
||||
ARGS -l ${python_source_labels} -c ${python_source_count} -d
|
||||
${python_source_depth}
|
||||
TIMEMORY_ARGS -l ${python_source_timemory_labels} -c ${python_source_timemory_count} -d
|
||||
${python_source_timemory_depth}
|
||||
PERFETTO_ARGS -l ${python_source_perfetto_labels} -c ${python_source_perfetto_count} -d
|
||||
${python_source_perfetto_depth}
|
||||
ROCPD_FILE "rocpd.db"
|
||||
ROCPD_RULES
|
||||
"${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/python/python-source-rules.json"
|
||||
)
|
||||
|
||||
set(python_builtin_labels
|
||||
# Timemory validation uses hierarchical output with multiple entries at different depths
|
||||
set(python_builtin_timemory_labels
|
||||
[run][builtin.py:31]
|
||||
[fib][builtin.py:13]
|
||||
[fib][builtin.py:13]
|
||||
@@ -280,7 +320,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
[fib][builtin.py:13]
|
||||
[inefficient][builtin.py:17]
|
||||
)
|
||||
set(python_builtin_count
|
||||
set(python_builtin_timemory_count
|
||||
5
|
||||
5
|
||||
10
|
||||
@@ -294,7 +334,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
10
|
||||
5
|
||||
)
|
||||
set(python_builtin_depth
|
||||
set(python_builtin_timemory_depth
|
||||
0
|
||||
1
|
||||
2
|
||||
@@ -309,14 +349,26 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS})
|
||||
1
|
||||
)
|
||||
|
||||
# With trace caching, perfetto output aggregates every call to the same
# function, so one entry per unique label is expected instead of the
# hierarchical per-depth entries used for timemory validation.
set(python_builtin_perfetto_labels
    [run][builtin.py:31] [fib][builtin.py:13] [inefficient][builtin.py:17]
)
set(python_builtin_perfetto_count
    5
    445
    5
)
set(python_builtin_perfetto_depth
    0
    1
    1
)
|
||||
|
||||
rocprofiler_systems_add_python_validation_test(
|
||||
NAME python-builtin
|
||||
TIMEMORY_METRIC "trip_count"
|
||||
TIMEMORY_FILE "trip_count.json"
|
||||
PERFETTO_METRIC "python"
|
||||
PERFETTO_FILE "perfetto-trace.proto"
|
||||
ARGS -l ${python_builtin_labels} -c ${python_builtin_count} -d
|
||||
${python_builtin_depth}
|
||||
TIMEMORY_ARGS -l ${python_builtin_timemory_labels} -c ${python_builtin_timemory_count} -d
|
||||
${python_builtin_timemory_depth}
|
||||
PERFETTO_ARGS -l ${python_builtin_perfetto_labels} -c ${python_builtin_perfetto_count} -d
|
||||
${python_builtin_perfetto_depth}
|
||||
ROCPD_FILE "rocpd.db"
|
||||
ROCPD_RULES
|
||||
"${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/python/python-builtin-rules.json"
|
||||
|
||||
@@ -37,8 +37,11 @@ endif()
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
# Ensure ROCPROFSYS_ROCM_DOMAINS is defined
|
||||
# Use legacy trace mode for roctx tests to preserve depth information
|
||||
set(_roctx_environment
|
||||
"${_base_environment}"
|
||||
"ROCPROFSYS_TRACE_LEGACY=ON"
|
||||
"ROCPROFSYS_TRACE_CACHED=OFF"
|
||||
"ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,marker_api,kernel_dispatch"
|
||||
)
|
||||
|
||||
@@ -56,7 +59,8 @@ rocprofiler_systems_add_test(
|
||||
ENVIRONMENT "${_roctx_environment}"
|
||||
)
|
||||
|
||||
set(ROCTX_LABEL
|
||||
# Legacy mode preserves individual entries with their original depths
|
||||
set(ROCTX_LEGACY_LABEL
|
||||
roctxMark_GPU_workload
|
||||
roctxRangePush_run_profiling
|
||||
roctxRangeStart_GPU_Compute
|
||||
@@ -70,7 +74,7 @@ set(ROCTX_LABEL
|
||||
roctxMark_Finished_GPU
|
||||
)
|
||||
|
||||
set(ROCTX_COUNT
|
||||
set(ROCTX_LEGACY_COUNT
|
||||
1
|
||||
1
|
||||
1
|
||||
@@ -84,7 +88,7 @@ set(ROCTX_COUNT
|
||||
1
|
||||
)
|
||||
|
||||
set(ROCTX_DEPTH
|
||||
set(ROCTX_LEGACY_DEPTH
|
||||
1
|
||||
1
|
||||
2
|
||||
@@ -98,6 +102,57 @@ set(ROCTX_DEPTH
|
||||
1
|
||||
)
|
||||
|
||||
# Expectations for cached trace mode: entries are aggregated by name, so each
# count is the total number of occurrences of that label.
# NOTE(review): the three lists are presumably matched element-by-element.
set(ROCTX_CACHED_LABEL
    roctxMark_GPU_workload roctxRangePush_HIP_Kernel roctxRangeStart_GPU_Compute
    roctxGetThreadId roctxMark_RoctxProfilerPause_End roctxMark_Thread_Start
    roctxMark_End roctxRangePush_run_profiling roctxMark_Finished_GPU
)

set(ROCTX_CACHED_COUNT 1 2 2 1 1 1 1 1 1)

set(ROCTX_CACHED_DEPTH 1 1 1 1 1 2 1 1 1)
|
||||
|
||||
# Select the expectation set that matches the trace mode of the roctx test
# environment: legacy expectations when ROCPROFSYS_TRACE_LEGACY=ON is present
# in _roctx_environment, cached expectations otherwise.
list(FIND _roctx_environment "ROCPROFSYS_TRACE_LEGACY=ON" _legacy_idx)
if(_legacy_idx EQUAL -1)
    # Entry not found: cached mode
    set(ROCTX_LABEL ${ROCTX_CACHED_LABEL})
    set(ROCTX_COUNT ${ROCTX_CACHED_COUNT})
    set(ROCTX_DEPTH ${ROCTX_CACHED_DEPTH})
else()
    # Entry found: legacy mode
    set(ROCTX_LABEL ${ROCTX_LEGACY_LABEL})
    set(ROCTX_COUNT ${ROCTX_LEGACY_COUNT})
    set(ROCTX_DEPTH ${ROCTX_LEGACY_DEPTH})
endif()
|
||||
|
||||
rocprofiler_systems_add_validation_test(
|
||||
NAME roctx-api-sampling
|
||||
PERFETTO_METRIC "rocm_marker_api"
|
||||
|
||||
@@ -103,7 +103,8 @@ endif()
|
||||
set(_test_openmp_env "OMP_PROC_BIND=spread" "OMP_PLACES=threads" "OMP_NUM_THREADS=2")
|
||||
|
||||
set(_base_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=ON"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
|
||||
@@ -114,7 +115,8 @@ set(_base_environment
|
||||
)
|
||||
|
||||
set(_flat_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_TIME_OUTPUT=OFF"
|
||||
"ROCPROFSYS_COUT_OUTPUT=ON"
|
||||
@@ -144,7 +146,8 @@ set(_lock_environment
|
||||
)
|
||||
|
||||
set(_perfetto_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=OFF"
|
||||
"ROCPROFSYS_USE_SAMPLING=ON"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
|
||||
@@ -156,7 +159,8 @@ set(_perfetto_environment
|
||||
)
|
||||
|
||||
set(_timemory_environment
|
||||
"ROCPROFSYS_TRACE=OFF"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=OFF"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=ON"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
|
||||
@@ -177,7 +181,8 @@ set(_causal_environment
|
||||
)
|
||||
|
||||
set(_python_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
|
||||
@@ -190,7 +195,8 @@ set(_python_environment
|
||||
)
|
||||
|
||||
set(_attach_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
|
||||
@@ -204,7 +210,8 @@ set(_attach_environment
|
||||
)
|
||||
|
||||
set(_rccl_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
|
||||
@@ -217,7 +224,8 @@ set(_rccl_environment
|
||||
)
|
||||
|
||||
set(_window_environment
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=OFF"
|
||||
@@ -1329,6 +1337,7 @@ function(ROCPROFILER_SYSTEMS_ADD_VALIDATION_TEST)
|
||||
)
|
||||
endif()
|
||||
|
||||
set(_EXIST_FILES_TESTS "")
|
||||
foreach(_FILE ${TEST_EXIST_FILES})
|
||||
add_test(
|
||||
NAME validate-${TEST_NAME}-${_FILE}-exists
|
||||
@@ -1337,6 +1346,7 @@ function(ROCPROFILER_SYSTEMS_ADD_VALIDATION_TEST)
|
||||
${PROJECT_BINARY_DIR}/rocprof-sys-tests-output/${TEST_NAME}/${_FILE}
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
|
||||
)
|
||||
list(APPEND _EXIST_FILES_TESTS "validate-${TEST_NAME}-${_FILE}-exists")
|
||||
endforeach()
|
||||
|
||||
if(TEST_TIMEMORY_FILE)
|
||||
@@ -1432,6 +1442,18 @@ function(ROCPROFILER_SYSTEMS_ADD_VALIDATION_TEST)
|
||||
${TEST_PROPERTIES}
|
||||
)
|
||||
endforeach()
|
||||
|
||||
# Give every file-existence check collected in _EXIST_FILES_TESTS the
# validation test's timeout, labels and fixtures, and make it depend on
# TEST_DEPENDS plus the test named TEST_NAME.
foreach(_EXIST_TEST IN LISTS _EXIST_FILES_TESTS)
    set_tests_properties(
        ${_EXIST_TEST}
        PROPERTIES TIMEOUT ${TEST_TIMEOUT}
                   LABELS "${TEST_LABELS}"
                   DEPENDS "${TEST_DEPENDS};${TEST_NAME}"
                   FIXTURES_REQUIRED "${_VALIDATION_FIXTURES}"
    )
endforeach()
|
||||
endfunction()
|
||||
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
@@ -1456,7 +1478,8 @@ function(ROCPROFILER_SYSTEMS_ADD_BIN_TEST)
|
||||
|
||||
if(NOT TEST_ENVIRONMENT)
|
||||
set(TEST_ENVIRONMENT
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=ON"
|
||||
"ROCPROFSYS_TIME_OUTPUT=OFF"
|
||||
|
||||
@@ -32,7 +32,8 @@ endif()
|
||||
|
||||
set(_thread_limit_environment
|
||||
"${_base_environment}"
|
||||
"ROCPROFSYS_TRACE=ON"
|
||||
"ROCPROFSYS_TRACE_LEGACY=OFF"
|
||||
"ROCPROFSYS_TRACE_CACHED=ON"
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_COUT_OUTPUT=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=ON"
|
||||
|
||||
مرجع در شماره جدید
Block a user