[Rocprofiler-system]: Fix GPU event enumeration for rocprof-sys-avail and CLI option for parsing GPU HW Counters (#2476)

## Motivation

The `rocprof-sys-avail -H -c GPU` command returns blank output; it is expected to display a list of the available GPU hardware counters.
The `rocprof-sys-sample` and `rocprof-sys-run` commands are missing the `--gpu-events` option for specifying GPU counter events during profiling.

## Technical Details

The `initialize_event_info()` function had a logic bug: it only called `set_agents()` if the agent_manager was empty, whereas the actual issue was that the `gpu_agents` and `cpu_agents` vectors could be empty even when agents had already been discovered.
Fixed the conditional logic to properly call set_agents() when gpu_agents and cpu_agents are empty, regardless of the agent_manager state.

Added the `--gpu-events` (`-G`) option, which sets the `ROCPROFSYS_ROCM_EVENTS` environment variable to the specified events, joined into a comma-separated list.

Also fixes the handling of unsupported GPU/APU architectures: agents without counter info are now skipped gracefully with a warning - more details about this issue in the below comment.
This commit is contained in:
Sajina PK
2026-01-09 11:59:45 -05:00
gecommit door GitHub
bovenliggende ebe22b5907
commit b3f59a37e4
5 gewijzigde bestanden met toevoegingen van 118 en 16 verwijderingen
@@ -519,6 +519,9 @@ main(int argc, char** argv)
} catch(std::runtime_error& _e)
{
verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what());
} catch(std::exception& _e)
{
verbprintf(0, "Exception retrieving GPU HW counters: %s", _e.what());
}
verbprintf(1, "Found %i HIP devices and %zu GPU HW counters\n", gpu_count,
_num_metrics);
@@ -837,6 +837,18 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
original_envs);
});
parser
.add_argument({ "-G", "--gpu-events" },
"Set the GPU hardware counter events to record (ref: "
"`rocprof-sys-avail -H -c GPU`)")
.action([&](parser_t& p) {
auto _events =
join(array_config{ "," }, p.get<std::vector<std::string>>("gpu-events"));
rocprofsys::common::update_env(_env, "ROCPROFSYS_ROCM_EVENTS", _events,
update_mode::REPLACE, ":", updated_envs,
original_envs);
});
parser.start_group("MISCELLANEOUS OPTIONS", "");
parser
.add_argument({ "-i", "--inlines" },
@@ -1073,6 +1073,23 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_data.processed_environs.emplace("papi_events");
}
if(_data.environ_filter("gpu_events", _data))
{
_parser
.add_argument({ "-G", "--gpu-events" },
"Set the GPU hardware counter events to record (ref: "
"`rocprof-sys-avail -H -c GPU`)")
.min_count(1)
.dtype("[EVENT ...]")
.action([&](parser_t& p) {
auto _events = join(array_config_t{ "," }, p.get<strvec_t>("gpu-events"));
update_env(_data, "ROCPROFSYS_ROCM_EVENTS", _events);
});
_data.processed_environs.emplace("gpu_events");
_data.processed_environs.emplace("rocm_events");
}
add_group_arguments(_parser, "category", _data, true);
add_group_arguments(_parser, "io", _data, true);
add_group_arguments(_parser, "perfetto", _data, true);
@@ -213,6 +213,18 @@ create_agent_profile(rocprofiler_agent_id_t agent_id,
auto counters_v = counter_vec_t{};
const auto* tool_agent_v = data->get_gpu_tool_agent(agent_id);
// Check if agent info is available (may not be for unsupported architectures)
auto agent_info_it = data->agent_counter_info.find(agent_id);
if(agent_info_it == data->agent_counter_info.end())
{
ROCPROFSYS_WARNING_F(0,
"Skipping GPU agent %lu (device %lu) due to unsupported "
"architecture or missing counter info\n",
agent_id.handle, tool_agent_v->device_id);
data->agent_counter_profiles.emplace(agent_id, profile);
return counter_vec_t{};
}
constexpr auto device_qualifier = std::string_view{ ":device=" };
for(const auto& itr : counters)
{
@@ -263,7 +275,7 @@ create_agent_profile(rocprofiler_agent_id_t agent_id,
}
// search the gpu agent counter info for a counter with a matching name
for(const auto& citr : data->agent_counter_info.at(agent_id))
for(const auto& citr : agent_info_it->second)
{
if(name_v == std::string_view{ citr.name })
{
@@ -280,11 +292,34 @@ create_agent_profile(rocprofiler_agent_id_t agent_id,
auto found_counters =
timemory::join::join(timemory::join::array_config{ ", ", "", "" }, found_v);
ROCPROFSYS_ABORT_F(
"Unable to find all counters for agent %i (gpu-%li, %s) in %s. Found: %s\n",
// Determine which counters were not found
auto missing_counters = std::vector<std::string>{};
for(const auto& counter : counters)
{
if(std::find(found_v.begin(), found_v.end(), counter) == found_v.end())
missing_counters.emplace_back(counter);
}
auto missing_counters_str = timemory::join::join(
timemory::join::array_config{ ", ", "", "" }, missing_counters);
// In production, warn and continue with available counters
ROCPROFSYS_WARNING_F(0,
"Unable to find all counters for agent %i (gpu-%li, %s). "
"Requested: %s. Found: %s. Missing: %s. Continuing with "
"available counters.\n",
tool_agent_v->agent->node_id, tool_agent_v->device_id,
tool_agent_v->agent->name.c_str(),
requested_counters.c_str(), found_counters.c_str(),
missing_counters_str.c_str());
// In CI, throw to catch issues early
ROCPROFSYS_CI_THROW(
true,
"Unable to find all counters for agent %i (gpu-%li, %s). Requested: %s. "
"Found: %s. Missing: %s",
tool_agent_v->agent->node_id, tool_agent_v->device_id,
tool_agent_v->agent->name.c_str(), requested_counters.c_str(),
found_counters.c_str());
found_counters.c_str(), missing_counters_str.c_str());
}
if(!counters_v.empty())
@@ -93,18 +93,35 @@ get_agent_counter_info(const tool_agent_vec_t& _agents)
{
const auto& _agent_id = rocprofiler_agent_id_t{ itr.agent->handle };
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
_agent_id, counters_supported_callback, &_data));
auto status = rocprofiler_iterate_agent_supported_counters(
_agent_id, counters_supported_callback, &_data);
std::sort(_data.at(_agent_id).begin(), _data.at(_agent_id).end(),
[](const auto& lhs, const auto& rhs) {
return (lhs.id.handle < rhs.id.handle);
});
for(auto& citr : _data.at(_agent_id))
if(status != ROCPROFILER_STATUS_SUCCESS)
{
std::sort(citr.dimension_info.begin(), citr.dimension_info.end(),
[](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); });
ROCPROFSYS_WARNING_F(
0,
"rocprofiler_iterate_agent_supported_counters failed for agent %lu "
"with status %d (Agent HW architecture may not be supported)\n",
_agent_id.handle, static_cast<int>(status));
// Skip processing for this agent if it's not supported
continue;
}
// Only process if the agent was successfully added to the map
auto agent_it = _data.find(_agent_id);
if(agent_it != _data.end())
{
std::sort(agent_it->second.begin(), agent_it->second.end(),
[](const auto& lhs, const auto& rhs) {
return (lhs.id.handle < rhs.id.handle);
});
for(auto& citr : agent_it->second)
{
std::sort(
citr.dimension_info.begin(), citr.dimension_info.end(),
[](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); });
}
}
}
@@ -131,10 +148,16 @@ client_data::initialize()
void
client_data::initialize_event_info()
{
if(get_agent_manager_instance().get_agents().empty())
auto& agent_mngr = get_agent_manager_instance();
if(agent_mngr.get_agents().empty())
{
initialize();
}
else if(gpu_agents.empty() && cpu_agents.empty())
{
set_agents();
}
if(agent_counter_info.size() != gpu_agents.size())
agent_counter_info = get_agent_counter_info(gpu_agents);
@@ -154,7 +177,19 @@ client_data::initialize_event_info()
_device_qualifier_sym,
JOIN(" ", "Device", _dev_index) };
auto _counter_info = agent_counter_info.at(_agent_id);
// Check if agent info is available ( i.e., counters are supported)
auto agent_info_it = agent_counter_info.find(_agent_id);
if(agent_info_it == agent_counter_info.end())
{
ROCPROFSYS_WARNING_F(0,
"Skipping GPU device %lu (%s, handle=0x%lx) due to "
"counter not found for the specified architecture\n",
_dev_index, aitr.agent->name.c_str(),
aitr.agent->handle);
continue;
}
auto _counter_info = agent_info_it->second;
std::sort(_counter_info.begin(), _counter_info.end(),
[](const rocprofiler_tool_counter_info_t& lhs,
const rocprofiler_tool_counter_info_t& rhs) {