Adding GPU index as a parameter for ATT (#547)

* Adding GPU index as a parameter for ATT

* Tidy fix

* Using tokenize

* Update tests/rocprofv3/advanced-thread-trace/CMakeLists.txt

Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com>

* Update tests/rocprofv3/advanced-thread-trace/CMakeLists.txt

* Adding error logging. Using idx instead of id.

---------

Co-authored-by: Giovanni <gbaraldi@amd.com>
Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com>

[ROCm/rocprofiler-sdk commit: fd6f96ffb5]
This commit is contained in:
Baraldi, Giovanni
2025-08-04 23:15:50 +02:00
committato da GitHub
parent 6b2a4fcfc2
commit 6a6b16be93
5 ha cambiato i file con 67 aggiunte e 0 eliminazioni
@@ -781,6 +781,13 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
type=str,
)
att_options.add_argument(
"--att-gpu-index",
help="Comma-separated list of GPU index(es) to enable thread trace. Default: All",
default=None,
type=str,
)
att_options.add_argument(
"--att-perfcounters",
help="(gfx9) List of performance counters, and optionally their SIMD mask.",
@@ -1553,6 +1560,12 @@ def run(app_args, args, **kwargs):
args.att_serialize_all,
overwrite=True,
)
if args.att_gpu_index:
update_env(
"ROCPROF_ATT_PARAM_GPU_INDEX",
args.att_gpu_index,
overwrite=True,
)
if check_att_capability(args):
update_env(
"ROCPROF_ATT_LIBRARY_PATH",
@@ -102,6 +102,9 @@ The following table lists the parameters relevant to thread tracing:
| | | | | Shorthand for att-perfcounter-ctrl and the att-perfcounters |
| | | | | related to compute unit activity such as VALU, SALU, etc. |
+--------------------------+---------+---------+-----------+--------------------------------------------------------------+
| att-gpu-index | Integer | | | Comma-separated list of integers. If enabled, only the GPU |
| | (List) | | | indexes in the list will be profiled by thread trace. |
+--------------------------+---------+---------+-----------+--------------------------------------------------------------+
For AMD Instinct accelerators, enable perfmon streaming using:
@@ -151,6 +151,7 @@ struct config : output_config
std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none");
std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", "");
std::string att_library_path = get_env("ROCPROF_ATT_LIBRARY_PATH", "");
std::string att_gpu_index = get_env("ROCPROF_ATT_PARAM_GPU_INDEX", "");
std::unordered_set<size_t> kernel_filter_range = {};
std::vector<std::set<std::string>> counters = {};
@@ -1941,10 +1941,28 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
ROCP_FATAL << "ATT Perf requires setting both perfcounter_ctrl and perfcounter list!";
}
auto gpu_idx_set = std::set<uint64_t>{};
for(const auto& entry :
rocprofiler::sdk::parse::tokenize(tool::get_config().att_gpu_index, ","))
{
try
{
gpu_idx_set.insert(std::stoi(entry));
} catch(std::exception& e)
{
ROCP_FATAL << "Invalid GPU Id string: " << entry << " - " << e.what();
}
}
const auto selecting_by_gpuid = !gpu_idx_set.empty();
for(auto& [id, agent] : tool_metadata->agents_map)
{
if(agent.type != ROCPROFILER_AGENT_TYPE_GPU) continue;
if(selecting_by_gpuid && gpu_idx_set.erase(agent.gpu_index) == 0) continue;
auto agent_params = global_parameters;
for(auto& counter : get_att_perfcounter_params(id, att_perf))
agent_params.push_back(counter);
@@ -1959,6 +1977,10 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
tool_data),
"thread trace service configure");
}
// Any index still left in gpu_idx_set was not matched by a GPU agent in agents_map
for(const auto& entry : gpu_idx_set)
ROCP_ERROR << "Invalid GPU Device Index: " << entry;
}
if(tool::get_config().counter_collection)
@@ -253,3 +253,31 @@ set_tests_properties(
rocprofv3-test-att-plus-pc-sampling
PROPERTIES TIMEOUT 90 LABELS "integration-tests;thread-trace;pc-sampling" DISABLED
${ATT_PLUS_PCS_DISABLE})
# Trace two GPUs
# Passes --att-gpu-index 0,1 so that thread trace is restricted to GPU indexes 0 and 1.
# NOTE(review): presumably this expects a machine with at least two GPUs; confirm the
# intended outcome on single-GPU runners, where index 1 would match no agent.
add_test(NAME rocprofv3-test-att-gpu-index-two-gpus
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/cmd_input -o
out --att-gpu-index 0,1 -- $<TARGET_FILE:vector-ops>)
# 45 s timeout; the DISABLED state comes from IS_DISABLED, which is set outside this snippet.
set_tests_properties(
rocprofv3-test-att-gpu-index-two-gpus
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED})
# Request one valid index (0) and one that cannot exist: no machine has
# gpu index = 9999, so the tool is expected to report it as an invalid device index.
add_test(NAME rocprofv3-test-att-gpu-index-will-fail
         COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/cmd_input -o
                 out --att-gpu-index 0,9999 -- $<TARGET_FILE:vector-ops>)

# The test passes only when the ROCP_ERROR specific to an invalid device index shows
# up in the output. PASS_REGULAR_EXPRESSION is used instead of combining WILL_FAIL with
# FAIL_REGULAR_EXPRESSION: WILL_FAIL inverts *any* failure, so an unrelated crash or
# non-zero exit that never printed the message would also have counted as a pass.
# The test name is kept unchanged so existing ctest filters keep working.
set_tests_properties(
    rocprofv3-test-att-gpu-index-will-fail
    PROPERTIES TIMEOUT
               45
               LABELS
               "integration-tests"
               DISABLED
               ${IS_DISABLED}
               PASS_REGULAR_EXPRESSION
               "Invalid GPU Device Index")