[Rocprofiler-system]: Fix GPU event enumeration for rocprof-sys-avail and add a CLI option for specifying GPU HW counters (#2476)
## Motivation

The `rocprof-sys-avail -H -c GPU` command returns blank output, although it is expected to display the list of available GPU hardware counters. In addition, `rocprof-sys-sample` and `rocprof-sys-run` are missing the `--gpu-events` option for specifying GPU counter events during profiling.

## Technical Details

The `initialize_event_info()` function had a logic bug: it only called `set_agents()` if the agent_manager was empty, but the actual issue was that the `gpu_agents` and `cpu_agents` vectors were empty even when agents had been discovered. The conditional logic was fixed to properly call `set_agents()` when `gpu_agents` and `cpu_agents` are empty, regardless of the agent_manager state.

Added the `--gpu-events` (`-G`) option, which sets the `ROCPROFSYS_ROCM_EVENTS` environment variable to the specified values.

Also fixes an issue with unsupported GPU/APU architectures, which are now skipped gracefully; more details about this issue are in the comment below.
This commit is contained in:
gecommit door
GitHub
bovenliggende
ebe22b5907
commit
b3f59a37e4
@@ -519,6 +519,9 @@ main(int argc, char** argv)
|
||||
} catch(std::runtime_error& _e)
|
||||
{
|
||||
verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what());
|
||||
} catch(std::exception& _e)
|
||||
{
|
||||
verbprintf(0, "Exception retrieving GPU HW counters: %s", _e.what());
|
||||
}
|
||||
verbprintf(1, "Found %i HIP devices and %zu GPU HW counters\n", gpu_count,
|
||||
_num_metrics);
|
||||
|
||||
@@ -837,6 +837,18 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
original_envs);
|
||||
});
|
||||
|
||||
parser
|
||||
.add_argument({ "-G", "--gpu-events" },
|
||||
"Set the GPU hardware counter events to record (ref: "
|
||||
"`rocprof-sys-avail -H -c GPU`)")
|
||||
.action([&](parser_t& p) {
|
||||
auto _events =
|
||||
join(array_config{ "," }, p.get<std::vector<std::string>>("gpu-events"));
|
||||
rocprofsys::common::update_env(_env, "ROCPROFSYS_ROCM_EVENTS", _events,
|
||||
update_mode::REPLACE, ":", updated_envs,
|
||||
original_envs);
|
||||
});
|
||||
|
||||
parser.start_group("MISCELLANEOUS OPTIONS", "");
|
||||
parser
|
||||
.add_argument({ "-i", "--inlines" },
|
||||
|
||||
@@ -1073,6 +1073,23 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_data.processed_environs.emplace("papi_events");
|
||||
}
|
||||
|
||||
if(_data.environ_filter("gpu_events", _data))
|
||||
{
|
||||
_parser
|
||||
.add_argument({ "-G", "--gpu-events" },
|
||||
"Set the GPU hardware counter events to record (ref: "
|
||||
"`rocprof-sys-avail -H -c GPU`)")
|
||||
.min_count(1)
|
||||
.dtype("[EVENT ...]")
|
||||
.action([&](parser_t& p) {
|
||||
auto _events = join(array_config_t{ "," }, p.get<strvec_t>("gpu-events"));
|
||||
update_env(_data, "ROCPROFSYS_ROCM_EVENTS", _events);
|
||||
});
|
||||
|
||||
_data.processed_environs.emplace("gpu_events");
|
||||
_data.processed_environs.emplace("rocm_events");
|
||||
}
|
||||
|
||||
add_group_arguments(_parser, "category", _data, true);
|
||||
add_group_arguments(_parser, "io", _data, true);
|
||||
add_group_arguments(_parser, "perfetto", _data, true);
|
||||
|
||||
+39
-4
@@ -213,6 +213,18 @@ create_agent_profile(rocprofiler_agent_id_t agent_id,
|
||||
auto counters_v = counter_vec_t{};
|
||||
const auto* tool_agent_v = data->get_gpu_tool_agent(agent_id);
|
||||
|
||||
// Check if agent info is available (may not be for unsupported architectures)
|
||||
auto agent_info_it = data->agent_counter_info.find(agent_id);
|
||||
if(agent_info_it == data->agent_counter_info.end())
|
||||
{
|
||||
ROCPROFSYS_WARNING_F(0,
|
||||
"Skipping GPU agent %lu (device %lu) due to unsupported "
|
||||
"architecture or missing counter info\n",
|
||||
agent_id.handle, tool_agent_v->device_id);
|
||||
data->agent_counter_profiles.emplace(agent_id, profile);
|
||||
return counter_vec_t{};
|
||||
}
|
||||
|
||||
constexpr auto device_qualifier = std::string_view{ ":device=" };
|
||||
for(const auto& itr : counters)
|
||||
{
|
||||
@@ -263,7 +275,7 @@ create_agent_profile(rocprofiler_agent_id_t agent_id,
|
||||
}
|
||||
|
||||
// search the gpu agent counter info for a counter with a matching name
|
||||
for(const auto& citr : data->agent_counter_info.at(agent_id))
|
||||
for(const auto& citr : agent_info_it->second)
|
||||
{
|
||||
if(name_v == std::string_view{ citr.name })
|
||||
{
|
||||
@@ -280,11 +292,34 @@ create_agent_profile(rocprofiler_agent_id_t agent_id,
|
||||
auto found_counters =
|
||||
timemory::join::join(timemory::join::array_config{ ", ", "", "" }, found_v);
|
||||
|
||||
ROCPROFSYS_ABORT_F(
|
||||
"Unable to find all counters for agent %i (gpu-%li, %s) in %s. Found: %s\n",
|
||||
// Determine which counters were not found
|
||||
auto missing_counters = std::vector<std::string>{};
|
||||
for(const auto& counter : counters)
|
||||
{
|
||||
if(std::find(found_v.begin(), found_v.end(), counter) == found_v.end())
|
||||
missing_counters.emplace_back(counter);
|
||||
}
|
||||
auto missing_counters_str = timemory::join::join(
|
||||
timemory::join::array_config{ ", ", "", "" }, missing_counters);
|
||||
|
||||
// In production, warn and continue with available counters
|
||||
ROCPROFSYS_WARNING_F(0,
|
||||
"Unable to find all counters for agent %i (gpu-%li, %s). "
|
||||
"Requested: %s. Found: %s. Missing: %s. Continuing with "
|
||||
"available counters.\n",
|
||||
tool_agent_v->agent->node_id, tool_agent_v->device_id,
|
||||
tool_agent_v->agent->name.c_str(),
|
||||
requested_counters.c_str(), found_counters.c_str(),
|
||||
missing_counters_str.c_str());
|
||||
|
||||
// In CI, throw to catch issues early
|
||||
ROCPROFSYS_CI_THROW(
|
||||
true,
|
||||
"Unable to find all counters for agent %i (gpu-%li, %s). Requested: %s. "
|
||||
"Found: %s. Missing: %s",
|
||||
tool_agent_v->agent->node_id, tool_agent_v->device_id,
|
||||
tool_agent_v->agent->name.c_str(), requested_counters.c_str(),
|
||||
found_counters.c_str());
|
||||
found_counters.c_str(), missing_counters_str.c_str());
|
||||
}
|
||||
|
||||
if(!counters_v.empty())
|
||||
|
||||
+47
-12
@@ -93,18 +93,35 @@ get_agent_counter_info(const tool_agent_vec_t& _agents)
|
||||
{
|
||||
const auto& _agent_id = rocprofiler_agent_id_t{ itr.agent->handle };
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
|
||||
_agent_id, counters_supported_callback, &_data));
|
||||
auto status = rocprofiler_iterate_agent_supported_counters(
|
||||
_agent_id, counters_supported_callback, &_data);
|
||||
|
||||
std::sort(_data.at(_agent_id).begin(), _data.at(_agent_id).end(),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return (lhs.id.handle < rhs.id.handle);
|
||||
});
|
||||
|
||||
for(auto& citr : _data.at(_agent_id))
|
||||
if(status != ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
std::sort(citr.dimension_info.begin(), citr.dimension_info.end(),
|
||||
[](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); });
|
||||
ROCPROFSYS_WARNING_F(
|
||||
0,
|
||||
"rocprofiler_iterate_agent_supported_counters failed for agent %lu "
|
||||
"with status %d (Agent HW architecture may not be supported)\n",
|
||||
_agent_id.handle, static_cast<int>(status));
|
||||
// Skip processing for this agent if it's not supported
|
||||
continue;
|
||||
}
|
||||
|
||||
// Only process if the agent was successfully added to the map
|
||||
auto agent_it = _data.find(_agent_id);
|
||||
if(agent_it != _data.end())
|
||||
{
|
||||
std::sort(agent_it->second.begin(), agent_it->second.end(),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return (lhs.id.handle < rhs.id.handle);
|
||||
});
|
||||
|
||||
for(auto& citr : agent_it->second)
|
||||
{
|
||||
std::sort(
|
||||
citr.dimension_info.begin(), citr.dimension_info.end(),
|
||||
[](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,10 +148,16 @@ client_data::initialize()
|
||||
void
|
||||
client_data::initialize_event_info()
|
||||
{
|
||||
if(get_agent_manager_instance().get_agents().empty())
|
||||
auto& agent_mngr = get_agent_manager_instance();
|
||||
|
||||
if(agent_mngr.get_agents().empty())
|
||||
{
|
||||
initialize();
|
||||
}
|
||||
else if(gpu_agents.empty() && cpu_agents.empty())
|
||||
{
|
||||
set_agents();
|
||||
}
|
||||
|
||||
if(agent_counter_info.size() != gpu_agents.size())
|
||||
agent_counter_info = get_agent_counter_info(gpu_agents);
|
||||
@@ -154,7 +177,19 @@ client_data::initialize_event_info()
|
||||
_device_qualifier_sym,
|
||||
JOIN(" ", "Device", _dev_index) };
|
||||
|
||||
auto _counter_info = agent_counter_info.at(_agent_id);
|
||||
// Check if agent info is available (i.e., counters are supported)
|
||||
auto agent_info_it = agent_counter_info.find(_agent_id);
|
||||
if(agent_info_it == agent_counter_info.end())
|
||||
{
|
||||
ROCPROFSYS_WARNING_F(0,
|
||||
"Skipping GPU device %lu (%s, handle=0x%lx) due to "
|
||||
"counter not found for the specified architecture\n",
|
||||
_dev_index, aitr.agent->name.c_str(),
|
||||
aitr.agent->handle);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto _counter_info = agent_info_it->second;
|
||||
std::sort(_counter_info.begin(), _counter_info.end(),
|
||||
[](const rocprofiler_tool_counter_info_t& lhs,
|
||||
const rocprofiler_tool_counter_info_t& rhs) {
|
||||
|
||||
Verwijs in nieuw issue
Block a user