From fd6f96ffb54054b405a6f05f800c64394126672d Mon Sep 17 00:00:00 2001 From: "Baraldi, Giovanni" Date: Mon, 4 Aug 2025 23:15:50 +0200 Subject: [PATCH] Adding GPU index as a parameter for ATT (#547) * Adding GPU index as a parameter for ATT * Tidy fix * Using tokenize * Update tests/rocprofv3/advanced-thread-trace/CMakeLists.txt Co-authored-by: Indic, Vladimir * Update tests/rocprofv3/advanced-thread-trace/CMakeLists.txt * Adding error logging. Using idx instead of id. --------- Co-authored-by: Giovanni Co-authored-by: Indic, Vladimir --- source/bin/rocprofv3.py | 13 +++++++++ source/docs/how-to/using-thread-trace.rst | 3 ++ source/lib/rocprofiler-sdk-tool/config.hpp | 1 + source/lib/rocprofiler-sdk-tool/tool.cpp | 22 +++++++++++++++ .../advanced-thread-trace/CMakeLists.txt | 28 +++++++++++++++++++ 5 files changed, 67 insertions(+) diff --git a/source/bin/rocprofv3.py b/source/bin/rocprofv3.py index 1b2acd5dbc..bc6be7a75c 100755 --- a/source/bin/rocprofv3.py +++ b/source/bin/rocprofv3.py @@ -781,6 +781,13 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins type=str, ) + att_options.add_argument( + "--att-gpu-index", + help="Comma-separated list of GPU index(es) to enable thread trace. Default: All", + default=None, + type=str, + ) + att_options.add_argument( "--att-perfcounters", help="(gfx9) List of performance counters, and optionally their SIMD mask.", @@ -1553,6 +1560,12 @@ def run(app_args, args, **kwargs): args.att_serialize_all, overwrite=True, ) + if args.att_gpu_index: + update_env( + "ROCPROF_ATT_PARAM_GPU_INDEX", + args.att_gpu_index, + overwrite=True, + ) if check_att_capability(args): update_env( "ROCPROF_ATT_LIBRARY_PATH", diff --git a/source/docs/how-to/using-thread-trace.rst b/source/docs/how-to/using-thread-trace.rst index 6ff22b10da..c78c0e0c2d 100644 --- a/source/docs/how-to/using-thread-trace.rst +++ b/source/docs/how-to/using-thread-trace.rst @@ -102,6 +102,9 @@ The following table lists the parameters relevant to thread tracing: | | | | | Shorthand for att-perfcounter-ctrl and the att-perfcounters | | | | | | related to compute unit activity such as VALU, SALU, etc. | +--------------------------+---------+---------+-----------+--------------------------------------------------------------+ +| att-gpu-index | Integer | | | Comma-separated list of integers. If enabled, only the GPU | +| | (List) | | | indexes in the list will be profiled by thread trace. | ++--------------------------+---------+---------+-----------+--------------------------------------------------------------+ For AMD Instinct accelerators, enable perfmon streaming using: diff --git a/source/lib/rocprofiler-sdk-tool/config.hpp b/source/lib/rocprofiler-sdk-tool/config.hpp index 7da04a9a9c..1bc1f826f3 100644 --- a/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/source/lib/rocprofiler-sdk-tool/config.hpp @@ -151,6 +151,7 @@ struct config : output_config std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none"); std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", ""); std::string att_library_path = get_env("ROCPROF_ATT_LIBRARY_PATH", ""); + std::string att_gpu_index = get_env("ROCPROF_ATT_PARAM_GPU_INDEX", ""); std::unordered_set kernel_filter_range = {}; std::vector> counters = {}; diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index bf0cd3e6f5..7afcbd9f77 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -1941,10 +1941,28 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) ROCP_FATAL << "ATT Perf requires setting both perfcounter_ctrl and perfcounter list!"; } + auto gpu_idx_set = std::set{}; + + for(const auto& entry : + rocprofiler::sdk::parse::tokenize(tool::get_config().att_gpu_index, ",")) + { + try + { + gpu_idx_set.insert(std::stoi(entry)); + } catch(std::exception& e) + { + ROCP_FATAL << "Invalid GPU Id string: " << entry << " - " << e.what(); + } + } + + const auto selecting_by_gpuid = !gpu_idx_set.empty(); + for(auto& [id, agent] : tool_metadata->agents_map) { if(agent.type != ROCPROFILER_AGENT_TYPE_GPU) continue; + if(selecting_by_gpuid && gpu_idx_set.erase(agent.gpu_index) == 0) continue; + auto agent_params = global_parameters; for(auto& counter : get_att_perfcounter_params(id, att_perf)) agent_params.push_back(counter); @@ -1959,6 +1977,10 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) tool_data), "thread trace service configure"); } + + // Any agent not removed by above loop was not in the agents_map list + for(const auto& entry : gpu_idx_set) + ROCP_ERROR << "Invalid GPU Device Index: " << entry; } if(tool::get_config().counter_collection) diff --git a/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt b/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt index 7a938b5dde..57a78cfa36 100644 --- a/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt +++ b/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt @@ -253,3 +253,31 @@ set_tests_properties( rocprofv3-test-att-plus-pc-sampling PROPERTIES TIMEOUT 90 LABELS "integration-tests;thread-trace;pc-sampling" DISABLED ${ATT_PLUS_PCS_DISABLE}) + +# Trace two GPUs +add_test(NAME rocprofv3-test-att-gpu-index-two-gpus + COMMAND $ ${COMMON_PARAMS}/cmd_input -o + out --att-gpu-index 0,1 -- $) + +set_tests_properties( + rocprofv3-test-att-gpu-index-two-gpus + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED}) + +# No machine has gpu index = 9999 +add_test(NAME rocprofv3-test-att-gpu-index-will-fail + COMMAND $ ${COMMON_PARAMS}/cmd_input -o + out --att-gpu-index 0,9999 -- $) + +# The FAIL regex for this test is the ROCP_ERROR specific to invalid device index +set_tests_properties( + rocprofv3-test-att-gpu-index-will-fail + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests" + DISABLED + ${IS_DISABLED} + WILL_FAIL + True + FAIL_REGULAR_EXPRESSION + "Invalid GPU Device Index")