diff --git a/projects/rocprofiler-systems/CHANGELOG.md b/projects/rocprofiler-systems/CHANGELOG.md index 7cdbb61f5b..1697e4b2a3 100644 --- a/projects/rocprofiler-systems/CHANGELOG.md +++ b/projects/rocprofiler-systems/CHANGELOG.md @@ -4,6 +4,25 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/). +## ROCm Systems Profiler 1.4.0 for ROCm x.y.z (unreleased) + +### Added + +- Documentation for `ROCPROFSYS_TRACE_CACHED` configuration option and its performance benefits. +- Documentation for `--trace-legacy` / `-L` CLI flag for direct tracing mode. + +### Changed + +- `ROCPROFSYS_TRACE_CACHED` is now the default perfetto tracing mode for improved performance. +- Renamed `ROCPROFSYS_TRACE` to `ROCPROFSYS_TRACE_LEGACY` (with backward compatibility). +- `--trace` / `-T` CLI flag now uses cached mode by default. +- Replaced the `--trace-cached` CLI flag of `rocprof-sys-sample` with `--trace-legacy` / `-L` for direct tracing mode. + +### Deprecated + +- `ROCPROFSYS_TRACE` environment variable (use `ROCPROFSYS_TRACE_LEGACY` for direct mode). +- `ROCPROFSYS_USE_PERFETTO` environment variable (use `ROCPROFSYS_TRACE_LEGACY`). + ## ROCm Systems Profiler 1.3.0 for ROCm 7.2.0 ### Added diff --git a/projects/rocprofiler-systems/docs/conceptual/data-collection-modes.rst b/projects/rocprofiler-systems/docs/conceptual/data-collection-modes.rst index 5154d494ea..65dc06ed69 100644 --- a/projects/rocprofiler-systems/docs/conceptual/data-collection-modes.rst +++ b/projects/rocprofiler-systems/docs/conceptual/data-collection-modes.rst @@ -176,7 +176,17 @@ Primary collection modes Trace mode (default) ^^^^^^^^^^^^^^^^^^^^^^^^ -Tracing mode generates comprehensive, deterministic traces of every event and measurement during application execution. This mode can be enabled using ``ROCPROFSYS_TRACE=true`` or ``ROCPROFSYS_MODE=trace`` setting. 
+Tracing mode generates comprehensive, deterministic traces of every event and measurement during application execution. This mode can be enabled using ``ROCPROFSYS_MODE=trace`` or by enabling one of the trace backend options. + +ROCm Systems Profiler provides two trace backend modes: + +- **Cached Mode (default, recommended)**: ``ROCPROFSYS_TRACE_CACHED=true`` or ``--trace`` / ``-T`` enables deferred trace generation with minimal runtime overhead. Trace data is buffered during execution and written after the application completes, significantly reducing performance impact during profiling. + +- **Legacy Mode**: ``ROCPROFSYS_TRACE_LEGACY=true`` or ``--trace-legacy`` / ``-L`` enables direct mode where trace data is written immediately during execution. This mode provides real-time trace generation but has higher runtime overhead compared to cached mode. + +.. note:: + + The ``ROCPROFSYS_TRACE`` environment variable is deprecated and has been renamed to ``ROCPROFSYS_TRACE_LEGACY``. For new workflows, use ``ROCPROFSYS_TRACE_CACHED`` (default) or ``ROCPROFSYS_TRACE_LEGACY`` explicitly. Additional configuration options to control the tracing behavior include: @@ -263,4 +273,4 @@ Granularity options: - Function-level: ``--coverage=function`` (``CODECOV_FUNCTION``) - Basic block-level: ``--coverage=basic_block`` (``CODECOV_BASIC_BLOCK``) -.. note:: Coverage mode disables several other features and all other modes to reduce overhead. \ No newline at end of file +.. note:: Coverage mode disables several other features and all other modes to reduce overhead. diff --git a/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst b/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst index fdf6799ab0..a2b614add1 100644 --- a/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst +++ b/projects/rocprofiler-systems/docs/how-to/configuring-runtime-options.rst @@ -34,7 +34,8 @@ and tweak the default sampling values. .. 
code-block:: shell # ... - ROCPROFSYS_TRACE = true + ROCPROFSYS_TRACE_CACHED = true # Recommended: deferred trace generation for minimal overhead + # ROCPROFSYS_TRACE_LEGACY = false # Alternative: direct mode with higher overhead ROCPROFSYS_PROFILE = true ROCPROFSYS_USE_SAMPLING = true ROCPROFSYS_USE_PROCESS_SAMPLING = true @@ -339,7 +340,8 @@ Generating a default configuration file ROCPROFSYS_CONFIG_FILE = ROCPROFSYS_MODE = trace - ROCPROFSYS_TRACE = true + ROCPROFSYS_TRACE_CACHED = true + ROCPROFSYS_TRACE_LEGACY = false ROCPROFSYS_PROFILE = false ROCPROFSYS_USE_SAMPLING = false ROCPROFSYS_USE_PROCESS_SAMPLING = true @@ -497,7 +499,9 @@ Viewing the setting descriptions | ROCPROFSYS_USE_CODE_COVERAGE | Enable support for code coverage | | ROCPROFSYS_USE_KOKKOSP | Enable support for Kokkos Tools | | ROCPROFSYS_USE_OMPT | Enable support for OpenMP-Tools | - | ROCPROFSYS_TRACE | Enable perfetto backend | + | ROCPROFSYS_TRACE_CACHED | Enable perfetto backend with deferre... | + | ROCPROFSYS_TRACE_LEGACY | Enable perfetto backend (legacy, dir... | + | ROCPROFSYS_TRACE | [DEPRECATED] Renamed to ROCPROFSYS_T... | | ROCPROFSYS_USE_PID | Enable tagging filenames with proces... | | ROCPROFSYS_USE_AMD_SMI | Enable sampling GPU power, temp, uti... | | ROCPROFSYS_USE_ROCM | Enable ROCM tracing | @@ -1345,7 +1349,8 @@ but do not override an existing value for the environment variable. 
$SAMPLE = OFF # use fields - ROCPROFSYS_TRACE = $ENABLE + ROCPROFSYS_TRACE_CACHED = $ENABLE # Recommended: deferred trace generation + ROCPROFSYS_TRACE_LEGACY = OFF # Legacy direct mode (higher overhead) ROCPROFSYS_PROFILE = $ENABLE ROCPROFSYS_USE_SAMPLING = $SAMPLE ROCPROFSYS_USE_PROCESS_SAMPLING = $SAMPLE diff --git a/projects/rocprofiler-systems/docs/how-to/nic-profiling.rst b/projects/rocprofiler-systems/docs/how-to/nic-profiling.rst index 8c7f0bfb29..0e91f5a0d2 100644 --- a/projects/rocprofiler-systems/docs/how-to/nic-profiling.rst +++ b/projects/rocprofiler-systems/docs/how-to/nic-profiling.rst @@ -75,7 +75,7 @@ The configuration parameter settings can be saved in a configuration file. Here ROCPROFSYS_SAMPLING_DELAY=0.05 ROCPROFSYS_SAMPLING_CPUS=0-9 ROCPROFSYS_SAMPLING_GPUS=$env:HIP_VISIBLE_DEVICES - ROCPROFSYS_TRACE=ON + ROCPROFSYS_TRACE_CACHED=ON ROCPROFSYS_PROFILE=ON ROCPROFSYS_USE_SAMPLING=ON ROCPROFSYS_USE_PROCESS_SAMPLING=OFF diff --git a/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst b/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst index 3b0cd6ebaa..97d01da86f 100644 --- a/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst +++ b/projects/rocprofiler-systems/docs/how-to/sampling-call-stack.rst @@ -284,7 +284,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ LD_PRELOAD=/opt/rocprofiler-systems/lib/librocprof-sys-dl.so.1.7.1 ROCPROFSYS_CPU_FREQ_ENABLED=false ROCPROFSYS_PROFILE=true - ROCPROFSYS_TRACE=true + ROCPROFSYS_TRACE_CACHED=true ROCPROFSYS_USE_AMD_SMI=true ROCPROFSYS_USE_PROCESS_SAMPLING=true ROCPROFSYS_USE_SAMPLING=true @@ -307,7 +307,7 @@ The following snippets show how ``rocprof-sys-sample`` runs with various environ ROCPROFSYS_USE_KOKKOSP=true ROCPROFSYS_USE_MPIP=true ROCPROFSYS_USE_OMPT=true - ROCPROFSYS_TRACE=true + ROCPROFSYS_TRACE_CACHED=true ROCPROFSYS_USE_PROCESS_SAMPLING=true ROCPROFSYS_USE_RCCLP=true ROCPROFSYS_USE_AMD_SMI=true @@ -337,7 +337,7 @@ The 
following snippets show how ``rocprof-sys-sample`` runs with various environ ROCPROFSYS_USE_KOKKOSP=false ROCPROFSYS_USE_MPIP=false ROCPROFSYS_USE_OMPT=false - ROCPROFSYS_TRACE=true + ROCPROFSYS_TRACE_CACHED=true ROCPROFSYS_USE_PROCESS_SAMPLING=true ROCPROFSYS_USE_RCCLP=false ROCPROFSYS_USE_AMD_SMI=false @@ -362,7 +362,7 @@ Here is the full output from the previous ROCPROFSYS_OUTPUT_PATH=rocprof-sys-output ROCPROFSYS_OUTPUT_PREFIX=%tag% ROCPROFSYS_PROFILE=true - ROCPROFSYS_TRACE=true + ROCPROFSYS_TRACE_CACHED=true ROCPROFSYS_TRACE_THREAD_LOCKS=false ROCPROFSYS_TRACE_THREAD_RW_LOCKS=false ROCPROFSYS_TRACE_THREAD_SPIN_LOCKS=false diff --git a/projects/rocprofiler-systems/docs/how-to/understanding-rocprof-sys-output.rst b/projects/rocprofiler-systems/docs/how-to/understanding-rocprof-sys-output.rst index 1f77e74e6b..a7d9d85f7a 100644 --- a/projects/rocprofiler-systems/docs/how-to/understanding-rocprof-sys-output.rst +++ b/projects/rocprofiler-systems/docs/how-to/understanding-rocprof-sys-output.rst @@ -17,7 +17,7 @@ For example, starting with the following base configuration: export ROCPROFSYS_TIME_OUTPUT=ON export ROCPROFSYS_USE_PID=OFF export ROCPROFSYS_PROFILE=ON - export ROCPROFSYS_TRACE=ON + export ROCPROFSYS_TRACE_CACHED=ON .. code-block:: shell diff --git a/projects/rocprofiler-systems/docs/how-to/using-rocprof-sys-api.rst b/projects/rocprofiler-systems/docs/how-to/using-rocprof-sys-api.rst index 639f87cd78..1d491ef78c 100644 --- a/projects/rocprofiler-systems/docs/how-to/using-rocprof-sys-api.rst +++ b/projects/rocprofiler-systems/docs/how-to/using-rocprof-sys-api.rst @@ -192,7 +192,7 @@ First, instrument and run the program. 
ROCPROFSYS: LD_PRELOAD=/home/rocm-dev/code/rocprofiler-systems/build/ubuntu/22.04/lib/librocprof-sys-dl.so.1.0.0 ROCPROFSYS: OMP_TOOL_LIBRARIES=/home/rocm-dev/code/rocprofiler-systems/build/ubuntu/22.04/lib/librocprof-sys-dl.so.1.0.0 ROCPROFSYS: ROCPROFSYS_PROFILE=true - ROCPROFSYS: ROCPROFSYS_TRACE=true + ROCPROFSYS: ROCPROFSYS_TRACE_CACHED=true ROCPROFSYS: ROCPROFSYS_VERBOSE=0 [rocprof-sys][dl][1827155] rocprofsys_main [rocprof-sys][1827155][rocprofsys_init_tooling] Instrumentation mode: Trace diff --git a/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt b/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt index 09a54f5b17..5c184187a8 100644 --- a/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt @@ -88,8 +88,7 @@ function(configure_ompvv_tests TEST_TYPE TEST_LIST_VAR) NUM_THREADS_DEVICE=${OMPVV_NUM_THREADS_DEVICE} NUM_TEAMS_DEVICE=${OMPVV_NUM_TEAMS_DEVICE} "FOFFLOADING=${FOFFLOADING_FLAGS}" "FFLAGS=${CUSTOM_FFLAGS}" - "FLINKFLAGS=${CUSTOM_FLINKFLAGS}" "SOURCES=${TEST}" compile > - ${CMAKE_CURRENT_BINARY_DIR}/ompvv-compile-${TARGET_NAME}.log 2>&1 + "FLINKFLAGS=${CUSTOM_FLINKFLAGS}" "SOURCES=${TEST}" compile COMMAND ${CMAKE_COMMAND} -E copy_if_different "${ROCPROFSYS_OMPVV_SOURCE_DIR}/bin/${TEST_NAME}.F90.o" @@ -223,7 +222,7 @@ endif() set(OMPVV_OPENMP_VERSION 5.0) set(OMPVV_TDIR "tests/${OMPVV_OPENMP_VERSION}") -set(OMPVV_NUM_THREADS_HOST 128) # Smallest possible value of ROCPROFSYS_THREAD_COUNT. 
Avoids very long test times +set(OMPVV_NUM_THREADS_HOST 8) # The default value from ompvv.F90 set(OMPVV_NUM_TEAMS_DEVICE 8) # default used by ompvv set(OMPVV_NUM_THREADS_DEVICE 8) # default used by ompvv diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp index ecce26230f..68901a6503 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/generate_config.cpp @@ -340,10 +340,11 @@ generate_config(std::string _config_file, const std::set& _config_f if(_romni && !_lomni) return false; for(const auto* itr : { "ROCPROFSYS_CONFIG", "ROCPROFSYS_MODE", "ROCPROFSYS_TRACE_CACHED", - "ROCPROFSYS_TRACE", "ROCPROFSYS_PROFILE", "ROCPROFSYS_USE_SAMPLING", - "ROCPROFSYS_USE_PROCESS_SAMPLING", "ROCPROFSYS_USE_ROCM", - "ROCPROFSYS_USE_AMD_SMI", "ROCPROFSYS_USE_KOKKOSP", - "ROCPROFSYS_USE_OMPT", "ROCPROFSYS_USE", "ROCPROFSYS_OUTPUT" }) + "ROCPROFSYS_TRACE_LEGACY", "ROCPROFSYS_PROFILE", + "ROCPROFSYS_USE_SAMPLING", "ROCPROFSYS_USE_PROCESS_SAMPLING", + "ROCPROFSYS_USE_ROCM", "ROCPROFSYS_USE_AMD_SMI", + "ROCPROFSYS_USE_KOKKOSP", "ROCPROFSYS_USE_OMPT", "ROCPROFSYS_USE", + "ROCPROFSYS_OUTPUT" }) { if(_lhs->get_env_name().find(itr) == 0 && _rhs->get_env_name().find(itr) != 0) diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp index 6ca6ce5530..b68ec8ffef 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp @@ -189,7 +189,7 @@ get_initial_environment() update_env(_env, "ROCPROFSYS_TRACE_CACHED", false); update_env(_env, "ROCPROFSYS_PROFILE", false); update_env(_env, "ROCPROFSYS_USE_PROCESS_SAMPLING", false); - update_env(_env, "ROCPROFSYS_TRACE", false); + update_env(_env, 
"ROCPROFSYS_TRACE_LEGACY", false); update_env(_env, "ROCPROFSYS_THREAD_POOL_SIZE", get_env("ROCPROFSYS_THREAD_POOL_SIZE", 0)); update_env(_env, "ROCPROFSYS_LAUNCHER", "rocprof-sys-causal"); diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp index 12894a1b79..0614c38b60 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp @@ -339,20 +339,23 @@ parse_args(int argc, char** argv, std::vector& _env) original_envs); }); parser - .add_argument({ "-T", "--trace" }, "Generate a detailed trace (perfetto output)") + .add_argument( + { "-T", "--trace" }, + "Generate a detailed trace with deferred trace generation (perfetto output)") .max_count(1) .action([&](parser_t& p) { - rocprofsys::common::update_env(_env, "ROCPROFSYS_TRACE", p.get("trace"), - update_mode::REPLACE, ":", updated_envs, - original_envs); + rocprofsys::common::update_env(_env, "ROCPROFSYS_TRACE_CACHED", + p.get("trace"), update_mode::REPLACE, + ":", updated_envs, original_envs); }); parser - .add_argument({ "--trace-cached" }, - "Generate a detailed trace (perfetto output) from cached data ") + .add_argument( + { "-L", "--trace-legacy" }, + "Generate a detailed trace with direct mode (perfetto output, legacy)") .max_count(1) .action([&](parser_t& p) { rocprofsys::common::update_env( - _env, "ROCPROFSYS_TRACE_CACHED", p.get("trace-cached"), + _env, "ROCPROFSYS_TRACE_LEGACY", p.get("trace-legacy"), update_mode::REPLACE, ":", updated_envs, original_envs); }); parser diff --git a/projects/rocprofiler-systems/source/lib/core/argparse.cpp b/projects/rocprofiler-systems/source/lib/core/argparse.cpp index 168bcf8a40..1e511c6604 100644 --- a/projects/rocprofiler-systems/source/lib/core/argparse.cpp +++ b/projects/rocprofiler-systems/source/lib/core/argparse.cpp @@ -301,14 +301,24 @@ add_core_arguments(parser_t& _parser, 
parser_data& _data) if(_data.environ_filter("trace", _data)) { _parser - .add_argument({ "-T", "--trace" }, - "Generate a detailed trace (perfetto output)") + .add_argument({ "-T", "--trace" }, "Generate a detailed trace with deferred " + "trace generation (perfetto output)") .max_count(1) .action([&](parser_t& p) { - update_env(_data, "ROCPROFSYS_TRACE", p.get("trace")); + update_env(_data, "ROCPROFSYS_TRACE_CACHED", p.get("trace")); + }); + + _parser + .add_argument( + { "-L", "--trace-legacy" }, + "Generate a detailed trace with direct mode (perfetto output, legacy)") + .max_count(1) + .action([&](parser_t& p) { + update_env(_data, "ROCPROFSYS_TRACE_LEGACY", p.get("trace-legacy")); }); _data.processed_environs.emplace("trace"); + _data.processed_environs.emplace("trace_legacy"); } if(_data.environ_filter("profile", _data)) diff --git a/projects/rocprofiler-systems/source/lib/core/config.cpp b/projects/rocprofiler-systems/source/lib/core/config.cpp index 7e5808f681..3441a99f56 100644 --- a/projects/rocprofiler-systems/source/lib/core/config.cpp +++ b/projects/rocprofiler-systems/source/lib/core/config.cpp @@ -299,20 +299,32 @@ configure_settings(bool _init) get_env("ROCPROFSYS_NUM_THREADS", 1), "threading", "performance", "sampling", "parallelism", "advanced"); - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE", "Enable perfetto backend", - _default_perfetto_v, "backend", "perfetto"); + ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_CACHED", + "Enable perfetto backend with deferred trace generation " + "for minimal runtime overhead", + _default_perfetto_v, "backend", "perfetto_caching"); + + ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_LEGACY", + "Enable perfetto backend (legacy, direct mode)", false, + "backend", "perfetto"); + + ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE", + "[DEPRECATED] Renamed to ROCPROFSYS_TRACE_LEGACY", false, + "backend", "perfetto", "deprecated"); ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_PERFETTO", - "[DEPRECATED] 
Renamed to ROCPROFSYS_TRACE", - _default_perfetto_v, "backend", "perfetto", "deprecated"); + "[DEPRECATED] Renamed to ROCPROFSYS_TRACE_LEGACY", false, + "backend", "perfetto", "deprecated"); ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_PROFILE", "Enable timemory backend", - !_config->get("ROCPROFSYS_TRACE"), "backend", - "timemory"); + !(_config->get("ROCPROFSYS_TRACE_LEGACY") || + _config->get("ROCPROFSYS_TRACE_CACHED")), + "backend", "timemory"); - ROCPROFSYS_CONFIG_SETTING( - bool, "ROCPROFSYS_USE_TIMEMORY", "[DEPRECATED] Renamed to ROCPROFSYS_PROFILE", - !_config->get("ROCPROFSYS_TRACE"), "backend", "timemory", "deprecated"); + ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_TIMEMORY", + "[DEPRECATED] Renamed to ROCPROFSYS_PROFILE", + !_config->get("ROCPROFSYS_TRACE_LEGACY"), "backend", + "timemory", "deprecated"); ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_CAUSAL", "Enable causal profiling analysis", false, "backend", @@ -321,10 +333,6 @@ configure_settings(bool _init) ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCPD", "Enable rocpd backend", false, "backend", "rocpd"); - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_CACHED", - "Enable perfetto with trace cache", false, "backend", - "perfetto_caching"); - ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_USE_ROCM", "Enable ROCm API and kernel tracing", true, "backend", "rocm"); @@ -1060,7 +1068,8 @@ configure_settings(bool _init) handle_deprecated_setting("ROCPROFSYS_USE_THREAD_SAMPLING", "ROCPROFSYS_USE_PROCESS_SAMPLING"); handle_deprecated_setting("ROCPROFSYS_OUTPUT_FILE", "ROCPROFSYS_PERFETTO_FILE"); - handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE"); + handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE_LEGACY"); + handle_deprecated_setting("ROCPROFSYS_TRACE", "ROCPROFSYS_TRACE_LEGACY"); handle_deprecated_setting("ROCPROFSYS_USE_TIMEMORY", "ROCPROFSYS_PROFILE"); scope::get_fields()[scope::flat::value] = _config->get_flat_profile(); @@ -1127,7 +1136,8 @@ 
configure_mode_settings(const std::shared_ptr& _config) if(get_mode() == Mode::Coverage) { set_default_setting_value("ROCPROFSYS_USE_CODE_COVERAGE", true); - _set("ROCPROFSYS_TRACE", false); + _set("ROCPROFSYS_TRACE_LEGACY", false); + _set("ROCPROFSYS_TRACE_CACHED", false); _set("ROCPROFSYS_PROFILE", false); _set("ROCPROFSYS_USE_CAUSAL", false); _set("ROCPROFSYS_USE_AMD_SMI", false); @@ -1140,7 +1150,8 @@ configure_mode_settings(const std::shared_ptr& _config) else if(get_mode() == Mode::Causal) { _set("ROCPROFSYS_USE_CAUSAL", true); - _set("ROCPROFSYS_TRACE", false); + _set("ROCPROFSYS_TRACE_LEGACY", false); + _set("ROCPROFSYS_TRACE_CACHED", false); _set("ROCPROFSYS_PROFILE", false); _set("ROCPROFSYS_USE_SAMPLING", false); _set("ROCPROFSYS_USE_PROCESS_SAMPLING", false); @@ -1836,7 +1847,7 @@ get_verbose() bool& get_use_perfetto() { - static auto _v = get_config()->at("ROCPROFSYS_TRACE"); + static auto _v = get_config()->at("ROCPROFSYS_TRACE_LEGACY"); return static_cast&>(*_v).get(); } @@ -2144,11 +2155,12 @@ get_perfetto_backend() std::string get_perfetto_output_filename() { - static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_FILE"); - auto _val = static_cast&>(*_v->second).get(); - auto _pos_dir = _val.find_last_of('/'); - auto _dir = std::string{}; - auto _ext = std::string{ "proto" }; + static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_FILE"); + auto _val = static_cast&>(*_v->second).get(); + + auto _pos_dir = _val.find_last_of('/'); + auto _dir = std::string{}; + auto _ext = std::string{ "proto" }; if(_pos_dir != std::string::npos) { _dir = _val.substr(0, _pos_dir + 1); @@ -2161,12 +2173,38 @@ get_perfetto_output_filename() _val = _val.substr(0, _pos_ext); } + ROCPROFSYS_BASIC_VERBOSE_F( + 2, "[get_perfetto_output_filename] Parsed: dir='%s', basename='%s', ext='%s'\n", + _dir.c_str(), _val.c_str(), _ext.c_str()); + ROCPROFSYS_BASIC_VERBOSE_F( + 2, "[get_perfetto_output_filename] settings::output_path()='%s'\n", + settings::output_path().c_str()); 
+ ROCPROFSYS_BASIC_VERBOSE_F( + 2, "[get_perfetto_output_filename] settings::output_prefix()='%s'\n", + settings::output_prefix().c_str()); + auto _cfg = settings::compose_filename_config{ settings::use_output_suffix(), settings::default_process_suffix(), false, _dir }; _val = settings::compose_output_filename(_val, _ext, _cfg); + + ROCPROFSYS_BASIC_VERBOSE_F( + 2, "[get_perfetto_output_filename] After compose_output_filename: '%s'\n", + _val.c_str()); + if(!_val.empty() && _val.at(0) != '/') - return settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag()); + { + auto _result = + settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag()); + ROCPROFSYS_BASIC_VERBOSE_F( + 2, "[get_perfetto_output_filename] Path is relative, prepending PWD: '%s'\n", + _result.c_str()); + return _result; + } + + ROCPROFSYS_BASIC_VERBOSE_F( + 2, "[get_perfetto_output_filename] Path is absolute, returning: '%s'\n", + _val.c_str()); return _val; } @@ -2426,8 +2464,20 @@ get_perfetto_output_filename_with_suffix(std::string_view suffix) static auto _v = get_config()->find("ROCPROFSYS_PERFETTO_FILE"); auto _val = static_cast&>(*_v->second).get(); + ROCPROFSYS_BASIC_VERBOSE_F(2, + "[get_perfetto_output_filename_with_suffix] Initial " + "ROCPROFSYS_PERFETTO_FILE='%s', suffix='%s'\n", + _val.c_str(), std::string{ suffix }.c_str()); + // If absolute path is provided, return it as-is - if(!_val.empty() && _val.at(0) == '/') return _val; + if(!_val.empty() && _val.at(0) == '/') + { + ROCPROFSYS_BASIC_VERBOSE_F( + 2, + "[get_perfetto_output_filename_with_suffix] Absolute path, returning: '%s'\n", + _val.c_str()); + return _val; + } auto _pos_dir = _val.find_last_of('/'); auto _dir = std::string{}; @@ -2451,6 +2501,15 @@ get_perfetto_output_filename_with_suffix(std::string_view suffix) bool _explicitly_set = (_v->second->get_environ_updated() || _v->second->get_config_updated()); + ROCPROFSYS_BASIC_VERBOSE_F( + 2, + "[get_perfetto_output_filename_with_suffix] Parsed: 
dir='%s', basename='%s', " + "ext='%s', explicitly_set=%s\n", + _dir.c_str(), _val.c_str(), _ext.c_str(), _explicitly_set ? "true" : "false"); + ROCPROFSYS_BASIC_VERBOSE_F( + 2, "[get_perfetto_output_filename_with_suffix] settings::output_path()='%s'\n", + settings::output_path().c_str()); + auto _cfg = settings::compose_filename_config{ !_explicitly_set && !suffix.empty(), // use_suffix only if not explicitly set suffix, // suffix value @@ -2459,9 +2518,27 @@ get_perfetto_output_filename_with_suffix(std::string_view suffix) }; _val = settings::compose_output_filename(_val, _ext, _cfg); - if(!_val.empty() && _val.at(0) != '/') - return settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag()); + ROCPROFSYS_BASIC_VERBOSE_F(2, + "[get_perfetto_output_filename_with_suffix] After " + "compose_output_filename: '%s'\n", + _val.c_str()); + + if(!_val.empty() && _val.at(0) != '/') + { + auto _result = + settings::format(JOIN('/', "%env{PWD}%", _val), get_config()->get_tag()); + ROCPROFSYS_BASIC_VERBOSE_F(2, + "[get_perfetto_output_filename_with_suffix] Path is " + "relative, prepending PWD: '%s'\n", + _result.c_str()); + return _result; + } + + ROCPROFSYS_BASIC_VERBOSE_F( + 2, + "[get_perfetto_output_filename_with_suffix] Path is absolute, returning: '%s'\n", + _val.c_str()); return _val; } diff --git a/projects/rocprofiler-systems/source/lib/core/perfetto.cpp b/projects/rocprofiler-systems/source/lib/core/perfetto.cpp index 800f0a2978..17616dc114 100644 --- a/projects/rocprofiler-systems/source/lib/core/perfetto.cpp +++ b/projects/rocprofiler-systems/source/lib/core/perfetto.cpp @@ -243,6 +243,7 @@ post_process(tim::manager* _timemory_manager, bool& _perfetto_output_error) #endif auto _filename = config::get_perfetto_output_filename(); + if(!trace_data.empty()) { operation::file_output_message _fom{}; diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp 
b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp index dc7ab7d804..f94933986c 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp @@ -668,9 +668,40 @@ perfetto_processor_t::handle(const region_sample& _rs) annotate_perfetto(ctx, annotations); }; - tracing::push_perfetto_ts(category::rocm{}, _name.c_str(), _beg_ts, - ::perfetto::Flow::ProcessScoped(_corr_id), add_annotations); - tracing::pop_perfetto_ts(category::rocm{}, _name.c_str(), _end_ts); + auto emit_trace = [&](auto category_tag) { + using CategoryT = decltype(category_tag); + tracing::push_perfetto_ts(CategoryT{}, _name.c_str(), _beg_ts, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_annotations); + tracing::pop_perfetto_ts(CategoryT{}, _name.c_str(), _end_ts); + }; + + auto try_category = [&](auto category_tag) { + using CategoryT = decltype(category_tag); + if(_category == trait::name::value) + { + emit_trace(category_tag); + return true; + } + return false; + }; + + bool dispatched = + (try_category(category::host{}) || try_category(category::user{}) || + try_category(category::python{}) || try_category(category::mpi{}) || + try_category(category::pthread{}) || try_category(category::kokkos{}) || + try_category(category::rocm_hip_api{}) || + try_category(category::rocm_hsa_api{}) || + try_category(category::rocm_marker_api{}) || + try_category(category::rocm_rccl{}) || + try_category(category::rocm_rocdecode_api{}) || + try_category(category::rocm_rocjpeg_api{}) || try_category(category::vaapi{})); + + if(!dispatched) + { + // Default to rocm category for backward compatibility + emit_trace(category::rocm{}); + } } void @@ -894,32 +925,6 @@ perfetto_processor_t::handle([[maybe_unused]] const pmc_event_with_sample& _pmc) void perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi) { - // using amd_smi_gfx_track = 
perfetto_counter_track; - // using amd_smi_umc_track = perfetto_counter_track; - // using amd_smi_mm_track = perfetto_counter_track; - // using amd_smi_temp_track = perfetto_counter_track; - // using amd_smi_power_track = perfetto_counter_track; - // using amd_smi_mem_track = perfetto_counter_track; - // using amd_smi_vcn_track = perfetto_counter_track; - // using amd_smi_jpeg_track = - // perfetto_counter_track; using - // amd_smi_xgmi_link_width_track = - // perfetto_counter_track; - // using amd_smi_xgmi_link_speed_track = - // perfetto_counter_track; - // using amd_smi_xgmi_read_track = - // perfetto_counter_track; - // using amd_smi_xgmi_write_track = - // perfetto_counter_track; - // using amd_smi_pcie_link_width_track = - // perfetto_counter_track; - // using amd_smi_pcie_link_speed_track = - // perfetto_counter_track; - // using amd_smi_pcie_bandwidth_acc_track = - // perfetto_counter_track; - // using amd_smi_pcie_bandwidth_inst_track = - // perfetto_counter_track; - // Use the shared gpu_metrics_t from core/gpu_metrics.hpp using gpu_metrics_t = gpu::gpu_metrics_t; @@ -937,36 +942,6 @@ perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi) auto _ts = _amd_smi.timestamp; auto _device_id = _amd_smi.device_id; - // auto setup_tracks = [&]() { - // if(amd_smi_gfx_track::exists(_device_id)) return; - - // auto make_track_name = [&](const char* metric) { - // return JOIN(" ", "GPU", JOIN("", '[', _device_id, ']'), metric, "(S)"); - // }; - - // if(is_busy_enabled) - // { - // amd_smi_gfx_track::emplace(_device_id, make_track_name("GFX Busy"), "%"); - // amd_smi_umc_track::emplace(_device_id, make_track_name("UMC Busy"), "%"); - // amd_smi_mm_track::emplace(_device_id, make_track_name("MM Busy"), "%"); - // } - // if(is_temp_enabled) - // { - // amd_smi_temp_track::emplace(_device_id, make_track_name("Temperature"), - // "deg C"); - // } - // if(is_power_enabled) - // { - // amd_smi_power_track::emplace(_device_id, make_track_name("Power"), 
"W"); - // } - // if(is_mem_usage_enabled) - // { - // amd_smi_mem_track::emplace(_device_id, make_track_name("Memory Usage"), - // "MB"); - // } - // }; - - // setup_tracks(); setup_amd_smi_tracks(_device_id, is_busy_enabled, is_temp_enabled, is_power_enabled, is_mem_usage_enabled); @@ -1013,6 +988,14 @@ perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi) using Category = std::decay_t; + const char* metric_name = nullptr; + if constexpr(std::is_same_v) + metric_name = "VCN Activity"; + else if constexpr(std::is_same_v) + metric_name = "JPEG Activity"; + else + metric_name = trait::name::value; + for(size_t i = 0; i < data.size(); ++i) { const auto value = data[i]; @@ -1023,16 +1006,14 @@ perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi) { // Per-XCP format track_name = JOIN( - " ", "GPU", JOIN("", '[', _device_id, ']'), - trait::name::value, + " ", "GPU", JOIN("", '[', _device_id, ']'), metric_name, JOIN("", "XCP_", _idx.value(), ": [", (i < 10 ? "0" : ""), i, ']'), "(S)"); } else { // Device-level format - track_name = JOIN(" ", "GPU", JOIN("", '[', _device_id, ']'), - trait::name::value, + track_name = JOIN(" ", "GPU", JOIN("", '[', _device_id, ']'), metric_name, JOIN("", "[", (i < 10 ? 
"0" : ""), i, ']'), "(S)"); } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp index 478f96ec58..ee42562e4e 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp @@ -103,6 +103,7 @@ extern "C" void rocprofsys_set_mpi_hidden(bool, bool) ROCPROFSYS_HIDDEN_API; void rocprofsys_push_trace_hidden(const char*) ROCPROFSYS_HIDDEN_API; void rocprofsys_pop_trace_hidden(const char*) ROCPROFSYS_HIDDEN_API; + void rocprofsys_flush_pending_region_cache_hidden() ROCPROFSYS_HIDDEN_API; void rocprofsys_push_region_hidden(const char*) ROCPROFSYS_HIDDEN_API; void rocprofsys_pop_region_hidden(const char*) ROCPROFSYS_HIDDEN_API; void rocprofsys_push_category_region_hidden(rocprofsys_category_t, const char*, diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp index 74e5056223..695a56957d 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp @@ -899,11 +899,12 @@ rocprofsys_finalize_hidden(void) #endif ROCPROFSYS_DEBUG_F("Stopping and destroying instrumentation bundles...\n"); - for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i) + auto* _bundles = instrumentation_bundles::get(); + for(size_t i = 0; _bundles && i < thread_info::get_peak_num_threads(); ++i) { - if(!instrumentation_bundles::get()) continue; + if(i >= _bundles->size()) continue; const auto& _info = thread_info::get(i, SequentTID); - auto& itr = instrumentation_bundles::get()->at(i); + auto& itr = _bundles->at(i); while(itr != nullptr && !itr->empty()) { int _lvl = 1; @@ -1026,6 +1027,11 @@ rocprofsys_finalize_hidden(void) tracing::copy_timemory_hash_ids(); + // Flush any pending region cache entries (e.g., main entry point that wasn't + // explicitly stopped 
before finalization) + ROCPROFSYS_DEBUG_F("Flushing pending region cache entries...\n"); + rocprofsys_flush_pending_region_cache_hidden(); + bool _perfetto_output_error = false; if(get_use_perfetto()) { diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp index 5c0e2a09a1..a54d064015 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/category_region.hpp @@ -120,6 +120,31 @@ cache_stop(const char* name) rocprofsys::trait::name::value); } } + +/// Flush all pending cached entries for this thread. +/// Called during finalization to ensure entries that were started but not stopped +/// (e.g., main entry point) are written to the trace cache. +inline void +flush_pending_cached_entries() +{ + const auto end_ts = static_cast(rocprofsys::comp::wall_clock::record()); + uint64_t thread_id = 0; + + const auto& extended_info = rocprofsys::thread_info::get(std::this_thread::get_id()); + if(extended_info.has_value() && extended_info->index_data.has_value()) + { + constexpr size_t UNKNOWN_TIME = 0; + thread_id = extended_info->index_data->system_value; + rocprofsys::trace_cache::get_metadata_registry().add_thread_info( + { getppid(), getpid(), thread_id, UNKNOWN_TIME, UNKNOWN_TIME, "{}" }); + } + + for(const auto& [key, start_ts] : map_name_to_args) + { + cache_region(thread_id, key.name, start_ts, end_ts, key.category); + } + map_name_to_args.clear(); +} } // namespace namespace tim diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index 1e4569fe6d..9471bd0321 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ 
b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -533,12 +533,22 @@ void cache_region(const rocprofiler_callback_tracing_record_t* record, const rocprofiler_timestamp_t start_timestamp, const rocprofiler_timestamp_t end_timestamp, const std::string& call_stack, - const std::string& args_str, const std::string& category) + const std::string& args_str, const std::string& category, + std::string_view name = {}) { - auto callback_tracing_info = - trace_cache::get_metadata_registry().get_callback_tracing_info(); - auto _name = std::string{ callback_tracing_info.at(record->kind, record->operation) }; + // Use provided name if available, otherwise fall back to API operation name + std::string _name; + if(name.empty()) + { + auto callback_tracing_info = + trace_cache::get_metadata_registry().get_callback_tracing_info(); + _name = std::string{ callback_tracing_info.at(record->kind, record->operation) }; + } + else + { + _name = std::string{ name }; + } trace_cache::get_buffer_storage().store(trace_cache::region_sample{ record->thread_id, _name.c_str(), record->correlation_id.internal, @@ -814,7 +824,7 @@ tool_tracing_callback_stop( cache_add_thread_info(record.thread_id); std::string args_str = get_args_string(args); cache_region(&record, _beg_ts, _end_ts, call_stack.dump(), args_str, - trait::name::value); + trait::name::value, _name); } } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_data.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_data.hpp index 9c1e005f5b..4a77e8ed99 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_data.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_data.hpp @@ -31,6 +31,7 @@ #include "core/state.hpp" #include "core/timemory.hpp" #include "core/utility.hpp" +#include "library/thread_data_growth.hpp" #include "library/thread_deleter.hpp" #include @@ -54,15 +55,6 @@ using instrumentation_bundle_t = 
// allocator for instrumentation_bundle_t using bundle_allocator_t = tim::data::ring_buffer_allocator; -using grow_functor_t = int64_t (*)(int64_t); - -inline auto& -grow_functors() -{ - static auto _v = container::stable_vector{}; - return _v; -} - template struct base_thread_data { @@ -77,7 +69,16 @@ struct base_thread_data } return (_v) ? _v->capacity() : 0; }; - grow_functors().emplace_back(std::move(_func)); + grow_functors().emplace_back(_func); + + // Immediately sync this container to current peak_num_threads. + // This ensures containers instantiated after threads exceed + // max_supported_threads are properly sized. + auto _current_peak = get_current_peak_num_threads(); + if(_current_peak > static_cast(max_supported_threads)) + { + _func(_current_peak); + } } }; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_data_growth.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_data_growth.hpp new file mode 100644 index 0000000000..31b03cd288 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_data_growth.hpp @@ -0,0 +1,63 @@ +// MIT License +// +// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "core/concepts.hpp" +#include "core/containers/stable_vector.hpp" + +#include +#include + +namespace rocprofsys +{ +using grow_functor_t = int64_t (*)(int64_t); + +inline auto& +grow_functors() +{ + static auto _v = container::stable_vector{}; + return _v; +} + +inline auto& +get_peak_num_threads_callback() +{ + static std::function _v = []() -> int64_t { + return static_cast(max_supported_threads); + }; + return _v; +} + +inline int64_t +get_current_peak_num_threads() +{ + return get_peak_num_threads_callback()(); +} + +inline void +set_peak_num_threads_callback(std::function _cb) +{ + get_peak_num_threads_callback() = std::move(_cb); +} + +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp index 37846e1d39..39c5f86ae4 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp @@ -30,6 +30,7 @@ #include "library/causal/delay.hpp" #include "library/runtime.hpp" #include "library/thread_data.hpp" +#include "library/thread_data_growth.hpp" #include #include @@ -111,6 +112,13 @@ init_index_data(int64_t _tid, bool _offset = false) thread_local int64_t offset_causal_count = 0; const auto unknown_thread = std::optional{}; int64_t peak_num_threads = max_supported_threads; + +// Register callback to allow 
thread_data containers to query peak_num_threads +// when they are instantiated, ensuring late-instantiated containers are properly sized. +const auto peak_num_threads_callback_registered = []() { + set_peak_num_threads_callback([]() -> int64_t { return peak_num_threads; }); + return true; +}(); } // namespace std::string diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/regions.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/regions.cpp index 614d4c1d01..134759f696 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/regions.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/regions.cpp @@ -124,6 +124,12 @@ rocprofsys_pop_trace_hidden(const char* name) rocprofsys::component::category_region::stop(name); } +extern "C" void +rocprofsys_flush_pending_region_cache_hidden() +{ + flush_pending_cached_entries(); +} + //======================================================================================// /// /// diff --git a/projects/rocprofiler-systems/tests/rocpd-validation-rules/roctx/sdk-metrics-rules.json b/projects/rocprofiler-systems/tests/rocpd-validation-rules/roctx/sdk-metrics-rules.json index b61b1c9748..7c8eee6b27 100644 --- a/projects/rocprofiler-systems/tests/rocpd-validation-rules/roctx/sdk-metrics-rules.json +++ b/projects/rocprofiler-systems/tests/rocpd-validation-rules/roctx/sdk-metrics-rules.json @@ -60,24 +60,24 @@ }, { "comparison": "equals", - "description": "Verify that 'roctxMarkA' appears at 5 times in table 'regions'", - "error_message": "Expected 5 'roctxMarkA' entries in `regions` table", + "description": "Verify that roctxMark markers appear 5 times in table 'regions'", + "error_message": "Expected 5 roctxMark marker entries in `regions` table", "expected_result": 5, - "query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name = 'roctxMarkA';" + "query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name LIKE 'roctxMark_%';" }, { "comparison": 
"equals", - "description": "Verify that 'roctxRangePop' appears at 3 times in table 'regions'", - "error_message": "Expected 3 'roctxRangePop' entries in `regions` table", + "description": "Verify that roctxRangePush markers appear 3 times in table 'regions'", + "error_message": "Expected 3 roctxRangePush marker entries in `regions` table", "expected_result": 3, - "query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name = 'roctxRangePop';" + "query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name LIKE 'roctxRangePush_%';" }, { "comparison": "equals", - "description": "Verify that 'roctxRangeStop' appears at 2 times in table 'regions'", - "error_message": "Expected 2 'roctxRangeStop' entries in `regions` table", + "description": "Verify that roctxRangeStart markers appear 2 times in table 'regions'", + "error_message": "Expected 2 roctxRangeStart marker entries in `regions` table", "expected_result": 2, - "query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name = 'roctxRangeStop';" + "query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_marker_api' AND name LIKE 'roctxRangeStart_%';" } ] }, diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-annotate-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-annotate-tests.cmake index dbe70bc9ef..a6ffdfb465 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-annotate-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-annotate-tests.cmake @@ -36,6 +36,8 @@ if( ) set(_annotate_environment "${_base_environment}" + "ROCPROFSYS_TRACE_CACHED=OFF" + "ROCPROFSYS_TRACE_LEGACY=ON" "ROCPROFSYS_TIMEMORY_COMPONENTS=thread_cpu_clock papi_array" "ROCPROFSYS_PAPI_EVENTS=perf::PERF_COUNT_SW_CPU_CLOCK" "ROCPROFSYS_USE_SAMPLING=OFF" @@ -82,6 +84,8 @@ if( else() set(_annotate_environment "${_base_environment}" + "ROCPROFSYS_TRACE_CACHED=OFF" + "ROCPROFSYS_TRACE_LEGACY=ON" 
"ROCPROFSYS_TIMEMORY_COMPONENTS=thread_cpu_clock" "ROCPROFSYS_USE_SAMPLING=OFF" ) diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-binary-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-binary-tests.cmake index 786f527ebc..2d566e601b 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-binary-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-binary-tests.cmake @@ -233,7 +233,7 @@ rocprofiler_systems_add_bin_test( TIMEOUT 45 PASS_REGEX "ENVIRONMENT VARIABLE,[ \n]+ROCPROFSYS_CI_SKIP_PUSH_POP_CHECK,[ \n]+ROCPROFSYS_THREAD_POOL_SIZE,[ \n]+ROCPROFSYS_USE_PID,[ \n]+" - FAIL_REGEX "ROCPROFSYS_TRACE|ROCPROFSYS_ABORT_FAIL_REGEX" + FAIL_REGEX "ROCPROFSYS_TRACE_LEGACY|ROCPROFSYS_TRACE_CACHED|ROCPROFSYS_ABORT_FAIL_REGEX" ) string( @@ -270,7 +270,7 @@ rocprofiler_systems_add_bin_test( txt json xml --force TIMEOUT 45 LABELS "rocprofiler-systems-avail" - ENVIRONMENT "ROCPROFSYS_TRACE=OFF;ROCPROFSYS_PROFILE=ON" + ENVIRONMENT "ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_TRACE_CACHED=OFF;ROCPROFSYS_PROFILE=ON" PASS_REGEX "Outputting JSON configuration file '${_AVAIL_CFG_PATH}tweak\\\.json'(.*)Outputting XML configuration file '${_AVAIL_CFG_PATH}tweak\\\.xml'(.*)Outputting text configuration file '${_AVAIL_CFG_PATH}tweak\\\.cfg'(.*)" ) diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-gpu-connect-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-gpu-connect-tests.cmake index 3c5a285d1f..d9c1cc9ffe 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-gpu-connect-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-gpu-connect-tests.cmake @@ -26,7 +26,10 @@ # # -------------------------------------------------------------------------------------- # +# Use legacy trace mode for AMD SMI counters - cached mode doesn't support real-time counter tracking set(_gpu_connect_environment + "ROCPROFSYS_TRACE_CACHED=OFF" + "ROCPROFSYS_TRACE_LEGACY=ON" "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api" 
"ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,xgmi,pcie" "ROCPROFSYS_SAMPLING_CPUS=none" diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-instrument-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-instrument-tests.cmake index 407eb1a401..1eb01a5a51 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-instrument-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-instrument-tests.cmake @@ -74,5 +74,5 @@ rocprofiler_systems_add_test( REWRITE_ARGS -e -v 2 --min-instructions=8 RUN_ARGS 10 4 1000 ENVIRONMENT - "${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=OFF;ROCPROFSYS_TRACE=ON;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF" + "${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=OFF;ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_TRACE_CACHED=ON;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF" ) diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-mpi-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-mpi-tests.cmake index b9f2e28d28..425e7c64f4 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-mpi-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-mpi-tests.cmake @@ -51,9 +51,11 @@ rocprofiler_systems_add_test( REWRITE_RUN_PASS_REGEX "(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')" REWRITE_RUN_FAIL_REGEX - "(perfetto-trace|trip_count|sampling_percent|sampling_cpu_clock|sampling_wall_clock|wall_clock)-[0-9][0-9]+.(json|txt|proto)|ROCPROFSYS_ABORT_FAIL_REGEX" + "Outputting.*(perfetto-trace|trip_count|sampling_percent|sampling_cpu_clock|sampling_wall_clock|wall_clock)-[0-9][0-9]+.(json|txt|proto)|ROCPROFSYS_ABORT_FAIL_REGEX" ) +# mpi-perfetto-merge requires legacy trace mode because MPI trace combining +# uses MPI communication (mpi_get) which is only implemented in the legacy path rocprofiler_systems_add_test( SKIP_RUNTIME NAME "mpi-perfetto-merge" @@ -70,7 +72,8 @@ rocprofiler_systems_add_test( line --min-instructions 0 - ENVIRONMENT 
"${_base_environment};ROCPROFSYS_VERBOSE=1" + ENVIRONMENT + "${_base_environment};ROCPROFSYS_VERBOSE=1;ROCPROFSYS_TRACE_CACHED=OFF;ROCPROFSYS_TRACE_LEGACY=ON;ROCPROFSYS_PERFETTO_COMBINE_TRACES=ON" REWRITE_RUN_PASS_REGEX "Successfully executed: .+rocprof-sys-merge-output.sh.*" REWRITE_RUN_FAIL_REGEX @@ -123,7 +126,8 @@ rocprofiler_systems_add_test( ) set(_mpip_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" "ROCPROFSYS_USE_PROCESS_SAMPLING=OFF" @@ -138,7 +142,8 @@ set(_mpip_environment ) set(_mpip_all2all_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" "ROCPROFSYS_USE_PROCESS_SAMPLING=OFF" diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake index 3826714d2d..268bb3fc70 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake @@ -25,8 +25,11 @@ execute_process( message(STATUS "The list of all PAPI network events is ${_event_list}") +# Use legacy trace mode for network stats - cached mode doesn't support real-time counter tracking set(_nic_perf_environment "${_base_environment}" + "ROCPROFSYS_TRACE_CACHED=OFF" + "ROCPROFSYS_TRACE_LEGACY=ON" "ROCPROFSYS_OUTPUT_PATH=${PROJECT_BINARY_DIR}/rocprof-sys-tests-output/nic-performance" "ROCPROFSYS_USE_PID=OFF" "ROCPROFSYS_VERBOSE=1" diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-openmp-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-openmp-tests.cmake index d488052e21..698281c9dc 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-openmp-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-openmp-tests.cmake @@ -22,7 +22,8 @@ if(NOT EXISTS "${ROCM_LLVM_LIB_PATH}/libomptarget.so" AND ROCPROFSYS_USE_ROCM) 
endif() set(_ompt_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_TIME_OUTPUT=OFF" "ROCPROFSYS_USE_OMPT=ON" @@ -126,6 +127,7 @@ if(ROCPROFSYS_OMPVV_HOST_TESTS) -e -v 1 --label return args SAMPLING_TIMEOUT 300 REWRITE_TIMEOUT 300 + RUNTIME_TIMEOUT 600 ENVIRONMENT "${_ompt_environment};ROCPROFSYS_COUT_OUTPUT=ON;ROCPROFSYS_CI_SKIP_PUSH_POP_CHECK=ON" REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-pthread-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-pthread-tests.cmake index cfeb05dece..256f3b8386 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-pthread-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-pthread-tests.cmake @@ -34,7 +34,7 @@ rocprofiler_systems_add_test( RUNTIME_ARGS -e -i 256 RUN_ARGS 30 4 1000 ENVIRONMENT - "${_lock_environment};ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE=ON;ROCPROFSYS_COLLAPSE_THREADS=OFF;ROCPROFSYS_SAMPLING_REALTIME=ON;ROCPROFSYS_SAMPLING_REALTIME_FREQ=10;ROCPROFSYS_SAMPLING_REALTIME_TIDS=0;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF" + "${_lock_environment};ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_TRACE_CACHED=ON;ROCPROFSYS_COLLAPSE_THREADS=OFF;ROCPROFSYS_SAMPLING_REALTIME=ON;ROCPROFSYS_SAMPLING_REALTIME_FREQ=10;ROCPROFSYS_SAMPLING_REALTIME_TIDS=0;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF" REWRITE_RUN_PASS_REGEX "wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000" RUNTIME_PASS_REGEX @@ -50,7 +50,7 @@ rocprofiler_systems_add_test( TrampRecursive RUN_ARGS 10 4 1000 ENVIRONMENT - 
"${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE=OFF;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF" + "${_lock_environment};ROCPROFSYS_FLAT_PROFILE=ON;ROCPROFSYS_PROFILE=ON;ROCPROFSYS_TRACE_LEGACY=OFF;ROCPROFSYS_SAMPLING_KEEP_INTERNAL=OFF" REWRITE_RUN_PASS_REGEX "start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000" ) diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-python-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-python-tests.cmake index 7783f23343..d7471c2d93 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-python-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-python-tests.cmake @@ -164,15 +164,28 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) TEST "" "NAME;TIMEMORY_METRIC;TIMEMORY_FILE;PERFETTO_FILE" - "ARGS;PERFETTO_METRIC;ROCPD_FILE;ROCPD_RULES" + "ARGS;TIMEMORY_ARGS;PERFETTO_ARGS;PERFETTO_METRIC;ROCPD_FILE;ROCPD_RULES" ${ARGN} ) + # Use specific args if provided, otherwise fall back to common ARGS + if(TEST_TIMEMORY_ARGS) + set(_TIMEMORY_VALIDATION_ARGS ${TEST_TIMEMORY_ARGS}) + else() + set(_TIMEMORY_VALIDATION_ARGS ${TEST_ARGS}) + endif() + + if(TEST_PERFETTO_ARGS) + set(_PERFETTO_VALIDATION_ARGS ${TEST_PERFETTO_ARGS}) + else() + set(_PERFETTO_VALIDATION_ARGS ${TEST_ARGS}) + endif() + rocprofiler_systems_add_python_test( NAME ${TEST_NAME}-validate-timemory COMMAND ${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-timemory-json.py - -m ${TEST_TIMEMORY_METRIC} ${TEST_ARGS} -i + -m ${TEST_TIMEMORY_METRIC} ${_TIMEMORY_VALIDATION_ARGS} -i PYTHON_VERSION ${_VERSION} FILE rocprof-sys-tests-output/${TEST_NAME}/${_VERSION}/${TEST_TIMEMORY_FILE} DEPENDS ${TEST_NAME}-${_VERSION} @@ -185,7 +198,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) NAME ${TEST_NAME}-validate-perfetto COMMAND ${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-perfetto-proto.py - -m ${TEST_PERFETTO_METRIC} ${TEST_ARGS} -p -t + -m 
${TEST_PERFETTO_METRIC} ${_PERFETTO_VALIDATION_ARGS} -p -t /opt/trace_processor/bin/trace_processor_shell -i PYTHON_VERSION ${_VERSION} FILE rocprof-sys-tests-output/${TEST_NAME}/${_VERSION}/${TEST_PERFETTO_FILE} @@ -217,7 +230,8 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) endif() endfunction() - set(python_source_labels + # Timemory validation uses hierarchical output with multiple entries at different depths + set(python_source_timemory_labels main_loop run fib @@ -228,7 +242,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) inefficient _sum ) - set(python_source_count + set(python_source_timemory_count 5 3 3 @@ -239,7 +253,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) 3 3 ) - set(python_source_depth + set(python_source_timemory_depth 0 1 2 @@ -251,6 +265,29 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) 3 ) + # Perfetto (cached mode) aggregates entries by name + set(python_source_perfetto_labels + main_loop + run + fib + inefficient + _sum + ) + set(python_source_perfetto_count + 5 + 3 + 24 + 3 + 3 + ) + set(python_source_perfetto_depth + 0 + 1 + 2 + 2 + 3 + ) + set(python_source_categories python user) rocprofiler_systems_add_python_validation_test( @@ -259,14 +296,17 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) TIMEMORY_FILE "trip_count.json" PERFETTO_FILE "perfetto-trace.proto" PERFETTO_METRIC ${python_source_categories} - ARGS -l ${python_source_labels} -c ${python_source_count} -d - ${python_source_depth} + TIMEMORY_ARGS -l ${python_source_timemory_labels} -c ${python_source_timemory_count} -d + ${python_source_timemory_depth} + PERFETTO_ARGS -l ${python_source_perfetto_labels} -c ${python_source_perfetto_count} -d + ${python_source_perfetto_depth} ROCPD_FILE "rocpd.db" ROCPD_RULES "${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/python/python-source-rules.json" ) - set(python_builtin_labels + # Timemory validation uses hierarchical output with multiple entries at different depths + set(python_builtin_timemory_labels 
[run][builtin.py:31] [fib][builtin.py:13] [fib][builtin.py:13] @@ -280,7 +320,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) [fib][builtin.py:13] [inefficient][builtin.py:17] ) - set(python_builtin_count + set(python_builtin_timemory_count 5 5 10 @@ -294,7 +334,7 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) 10 5 ) - set(python_builtin_depth + set(python_builtin_timemory_depth 0 1 2 @@ -309,14 +349,26 @@ foreach(_VERSION ${ROCPROFSYS_PYTHON_VERSIONS}) 1 ) + # Perfetto validation with trace caching aggregates all calls to the same function, + # so we only expect one entry per unique label rather than hierarchical entries. + set(python_builtin_perfetto_labels + [run][builtin.py:31] + [fib][builtin.py:13] + [inefficient][builtin.py:17] + ) + set(python_builtin_perfetto_count 5 445 5) + set(python_builtin_perfetto_depth 0 1 1) + rocprofiler_systems_add_python_validation_test( NAME python-builtin TIMEMORY_METRIC "trip_count" TIMEMORY_FILE "trip_count.json" PERFETTO_METRIC "python" PERFETTO_FILE "perfetto-trace.proto" - ARGS -l ${python_builtin_labels} -c ${python_builtin_count} -d - ${python_builtin_depth} + TIMEMORY_ARGS -l ${python_builtin_timemory_labels} -c ${python_builtin_timemory_count} -d + ${python_builtin_timemory_depth} + PERFETTO_ARGS -l ${python_builtin_perfetto_labels} -c ${python_builtin_perfetto_count} -d + ${python_builtin_perfetto_depth} ROCPD_FILE "rocpd.db" ROCPD_RULES "${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/python/python-builtin-rules.json" diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-roctx-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-roctx-tests.cmake index 61321638c3..986826cc87 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-roctx-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-roctx-tests.cmake @@ -37,8 +37,11 @@ endif() # -------------------------------------------------------------------------------------- # # Ensure ROCPROFSYS_ROCM_DOMAINS is defined +# Use 
legacy trace mode for roctx tests to preserve depth information set(_roctx_environment "${_base_environment}" + "ROCPROFSYS_TRACE_LEGACY=ON" + "ROCPROFSYS_TRACE_CACHED=OFF" "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,marker_api,kernel_dispatch" ) @@ -56,7 +59,8 @@ rocprofiler_systems_add_test( ENVIRONMENT "${_roctx_environment}" ) -set(ROCTX_LABEL +# Legacy mode preserves individual entries with their original depths +set(ROCTX_LEGACY_LABEL roctxMark_GPU_workload roctxRangePush_run_profiling roctxRangeStart_GPU_Compute @@ -70,7 +74,7 @@ set(ROCTX_LABEL roctxMark_Finished_GPU ) -set(ROCTX_COUNT +set(ROCTX_LEGACY_COUNT 1 1 1 @@ -84,7 +88,7 @@ set(ROCTX_COUNT 1 ) -set(ROCTX_DEPTH +set(ROCTX_LEGACY_DEPTH 1 1 2 @@ -98,6 +102,57 @@ set(ROCTX_DEPTH 1 ) +# Cached mode aggregates entries by name, so counts reflect total occurrences +set(ROCTX_CACHED_LABEL + roctxMark_GPU_workload + roctxRangePush_HIP_Kernel + roctxRangeStart_GPU_Compute + roctxGetThreadId + roctxMark_RoctxProfilerPause_End + roctxMark_Thread_Start + roctxMark_End + roctxRangePush_run_profiling + roctxMark_Finished_GPU +) + +set(ROCTX_CACHED_COUNT + 1 + 2 + 2 + 1 + 1 + 1 + 1 + 1 + 1 +) + +set(ROCTX_CACHED_DEPTH + 1 + 1 + 1 + 1 + 1 + 2 + 1 + 1 + 1 +) + +# Determine which expectations to use based on trace mode in environment +set(ROCTX_LABEL ${ROCTX_CACHED_LABEL}) +set(ROCTX_COUNT ${ROCTX_CACHED_COUNT}) +set(ROCTX_DEPTH ${ROCTX_CACHED_DEPTH}) + +# Check if ROCPROFSYS_TRACE_LEGACY=ON is set in the test environment +list(FIND _roctx_environment "ROCPROFSYS_TRACE_LEGACY=ON" _legacy_idx) +if(_legacy_idx GREATER -1) + # Legacy mode is enabled, use legacy expectations + set(ROCTX_LABEL ${ROCTX_LEGACY_LABEL}) + set(ROCTX_COUNT ${ROCTX_LEGACY_COUNT}) + set(ROCTX_DEPTH ${ROCTX_LEGACY_DEPTH}) +endif() + rocprofiler_systems_add_validation_test( NAME roctx-api-sampling PERFETTO_METRIC "rocm_marker_api" diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake 
b/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake index 64a5ba1a7e..b1b585ad90 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake @@ -103,7 +103,8 @@ endif() set(_test_openmp_env "OMP_PROC_BIND=spread" "OMP_PLACES=threads" "OMP_NUM_THREADS=2") set(_base_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=ON" "ROCPROFSYS_USE_PROCESS_SAMPLING=ON" @@ -114,7 +115,8 @@ set(_base_environment ) set(_flat_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_TIME_OUTPUT=OFF" "ROCPROFSYS_COUT_OUTPUT=ON" @@ -144,7 +146,8 @@ set(_lock_environment ) set(_perfetto_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=OFF" "ROCPROFSYS_USE_SAMPLING=ON" "ROCPROFSYS_USE_PROCESS_SAMPLING=ON" @@ -156,7 +159,8 @@ set(_perfetto_environment ) set(_timemory_environment - "ROCPROFSYS_TRACE=OFF" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=OFF" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=ON" "ROCPROFSYS_USE_PROCESS_SAMPLING=ON" @@ -177,7 +181,8 @@ set(_causal_environment ) set(_python_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" "ROCPROFSYS_USE_PROCESS_SAMPLING=ON" @@ -190,7 +195,8 @@ set(_python_environment ) set(_attach_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" "ROCPROFSYS_USE_PROCESS_SAMPLING=ON" @@ -204,7 +210,8 @@ set(_attach_environment ) set(_rccl_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" 
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON" @@ -217,7 +224,8 @@ set(_rccl_environment ) set(_window_environment - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" "ROCPROFSYS_USE_PROCESS_SAMPLING=OFF" @@ -1329,6 +1337,7 @@ function(ROCPROFILER_SYSTEMS_ADD_VALIDATION_TEST) ) endif() + set(_EXIST_FILES_TESTS "") foreach(_FILE ${TEST_EXIST_FILES}) add_test( NAME validate-${TEST_NAME}-${_FILE}-exists @@ -1337,6 +1346,7 @@ function(ROCPROFILER_SYSTEMS_ADD_VALIDATION_TEST) ${PROJECT_BINARY_DIR}/rocprof-sys-tests-output/${TEST_NAME}/${_FILE} WORKING_DIRECTORY ${PROJECT_BINARY_DIR} ) + list(APPEND _EXIST_FILES_TESTS "validate-${TEST_NAME}-${_FILE}-exists") endforeach() if(TEST_TIMEMORY_FILE) @@ -1432,6 +1442,18 @@ function(ROCPROFILER_SYSTEMS_ADD_VALIDATION_TEST) ${TEST_PROPERTIES} ) endforeach() + + # Set properties for file existence validation tests + foreach(_TEST ${_EXIST_FILES_TESTS}) + set_tests_properties( + ${_TEST} + PROPERTIES + TIMEOUT ${TEST_TIMEOUT} + LABELS "${TEST_LABELS}" + DEPENDS "${TEST_DEPENDS};${TEST_NAME}" + FIXTURES_REQUIRED "${_VALIDATION_FIXTURES}" + ) + endforeach() endfunction() # -------------------------------------------------------------------------------------- # @@ -1456,7 +1478,8 @@ function(ROCPROFILER_SYSTEMS_ADD_BIN_TEST) if(NOT TEST_ENVIRONMENT) set(TEST_ENVIRONMENT - "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=ON" "ROCPROFSYS_TIME_OUTPUT=OFF" diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake index 3e4ab1fc71..e515374178 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake @@ -32,7 +32,8 @@ endif() set(_thread_limit_environment "${_base_environment}" 
- "ROCPROFSYS_TRACE=ON" + "ROCPROFSYS_TRACE_LEGACY=OFF" + "ROCPROFSYS_TRACE_CACHED=ON" "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_COUT_OUTPUT=ON" "ROCPROFSYS_USE_SAMPLING=ON"