diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp index a62ea441d0..4258ff9524 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-avail/avail.cpp @@ -519,6 +519,9 @@ main(int argc, char** argv) } catch(std::runtime_error& _e) { verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what()); + } catch(std::exception& _e) + { + verbprintf(0, "Exception retrieving GPU HW counters: %s", _e.what()); } verbprintf(1, "Found %i HIP devices and %zu GPU HW counters\n", gpu_count, _num_metrics); diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp index 124217b102..9491315aeb 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp @@ -837,6 +837,18 @@ parse_args(int argc, char** argv, std::vector& _env) original_envs); }); + parser + .add_argument({ "-G", "--gpu-events" }, + "Set the GPU hardware counter events to record (ref: " + "`rocprof-sys-avail -H -c GPU`)") + .action([&](parser_t& p) { + auto _events = + join(array_config{ "," }, p.get>("gpu-events")); + rocprofsys::common::update_env(_env, "ROCPROFSYS_ROCM_EVENTS", _events, + update_mode::REPLACE, ":", updated_envs, + original_envs); + }); + parser.start_group("MISCELLANEOUS OPTIONS", ""); parser .add_argument({ "-i", "--inlines" }, diff --git a/projects/rocprofiler-systems/source/lib/core/argparse.cpp b/projects/rocprofiler-systems/source/lib/core/argparse.cpp index 443c217bc5..e7ec3d50e4 100644 --- a/projects/rocprofiler-systems/source/lib/core/argparse.cpp +++ b/projects/rocprofiler-systems/source/lib/core/argparse.cpp @@ -1073,6 +1073,23 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _data.processed_environs.emplace("papi_events"); } + if(_data.environ_filter("gpu_events", _data)) + { + _parser + .add_argument({ "-G", "--gpu-events" }, + "Set the GPU hardware counter events to record (ref: " + "`rocprof-sys-avail -H -c GPU`)") + .min_count(1) + .dtype("[EVENT ...]") + .action([&](parser_t& p) { + auto _events = join(array_config_t{ "," }, p.get("gpu-events")); + update_env(_data, "ROCPROFSYS_ROCM_EVENTS", _events); + }); + + _data.processed_environs.emplace("gpu_events"); + _data.processed_environs.emplace("rocm_events"); + } + add_group_arguments(_parser, "category", _data, true); add_group_arguments(_parser, "io", _data, true); add_group_arguments(_parser, "perfetto", _data, true); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index 9471bd0321..d73e47c33b 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -213,6 +213,18 @@ create_agent_profile(rocprofiler_agent_id_t agent_id, auto counters_v = counter_vec_t{}; const auto* tool_agent_v = data->get_gpu_tool_agent(agent_id); + // Check if agent info is available (may not be for unsupported architectures) + auto agent_info_it = data->agent_counter_info.find(agent_id); + if(agent_info_it == data->agent_counter_info.end()) + { + ROCPROFSYS_WARNING_F(0, + "Skipping GPU agent %lu (device %lu) due to unsupported " + "architecture or missing counter info\n", + agent_id.handle, tool_agent_v->device_id); + data->agent_counter_profiles.emplace(agent_id, profile); + return counter_vec_t{}; + } + constexpr auto device_qualifier = std::string_view{ ":device=" }; for(const auto& itr : counters) { @@ -263,7 +275,7 @@ create_agent_profile(rocprofiler_agent_id_t agent_id, } // search the gpu agent counter info for a counter with a matching name - for(const auto& citr : data->agent_counter_info.at(agent_id)) + for(const auto& citr : agent_info_it->second) { if(name_v == std::string_view{ citr.name }) { @@ -280,11 +292,34 @@ create_agent_profile(rocprofiler_agent_id_t agent_id, auto found_counters = timemory::join::join(timemory::join::array_config{ ", ", "", "" }, found_v); - ROCPROFSYS_ABORT_F( - "Unable to find all counters for agent %i (gpu-%li, %s) in %s. Found: %s\n", + // Determine which counters were not found + auto missing_counters = std::vector{}; + for(const auto& counter : counters) + { + if(std::find(found_v.begin(), found_v.end(), counter) == found_v.end()) + missing_counters.emplace_back(counter); + } + auto missing_counters_str = timemory::join::join( + timemory::join::array_config{ ", ", "", "" }, missing_counters); + + // In production, warn and continue with available counters + ROCPROFSYS_WARNING_F(0, + "Unable to find all counters for agent %i (gpu-%li, %s). " + "Requested: %s. Found: %s. Missing: %s. Continuing with " + "available counters.\n", + tool_agent_v->agent->node_id, tool_agent_v->device_id, + tool_agent_v->agent->name.c_str(), + requested_counters.c_str(), found_counters.c_str(), + missing_counters_str.c_str()); + + // In CI, throw to catch issues early + ROCPROFSYS_CI_THROW( + true, + "Unable to find all counters for agent %i (gpu-%li, %s). Requested: %s. " + "Found: %s. Missing: %s", tool_agent_v->agent->node_id, tool_agent_v->device_id, tool_agent_v->agent->name.c_str(), requested_counters.c_str(), - found_counters.c_str()); + found_counters.c_str(), missing_counters_str.c_str()); } if(!counters_v.empty()) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp index 06b2756e66..b1e6ee57b7 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.cpp @@ -93,18 +93,35 @@ get_agent_counter_info(const tool_agent_vec_t& _agents) { const auto& _agent_id = rocprofiler_agent_id_t{ itr.agent->handle }; - ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters( - _agent_id, counters_supported_callback, &_data)); + auto status = rocprofiler_iterate_agent_supported_counters( + _agent_id, counters_supported_callback, &_data); - std::sort(_data.at(_agent_id).begin(), _data.at(_agent_id).end(), - [](const auto& lhs, const auto& rhs) { - return (lhs.id.handle < rhs.id.handle); - }); - - for(auto& citr : _data.at(_agent_id)) + if(status != ROCPROFILER_STATUS_SUCCESS) { - std::sort(citr.dimension_info.begin(), citr.dimension_info.end(), - [](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); }); + ROCPROFSYS_WARNING_F( + 0, + "rocprofiler_iterate_agent_supported_counters failed for agent %lu " + "with status %d (Agent HW architecture may not be supported)\n", + _agent_id.handle, static_cast(status)); + // Skip processing for this agent if it's not supported + continue; + } + + // Only process if the agent was successfully added to the map + auto agent_it = _data.find(_agent_id); + if(agent_it != _data.end()) + { + std::sort(agent_it->second.begin(), agent_it->second.end(), + [](const auto& lhs, const auto& rhs) { + return (lhs.id.handle < rhs.id.handle); + }); + + for(auto& citr : agent_it->second) + { + std::sort( + citr.dimension_info.begin(), citr.dimension_info.end(), + [](const auto& lhs, const auto& rhs) { return (lhs.id < rhs.id); }); + } } } @@ -131,10 +148,16 @@ client_data::initialize() void client_data::initialize_event_info() { - if(get_agent_manager_instance().get_agents().empty()) + auto& agent_mngr = get_agent_manager_instance(); + + if(agent_mngr.get_agents().empty()) { initialize(); } + else if(gpu_agents.empty() && cpu_agents.empty()) + { + set_agents(); + } if(agent_counter_info.size() != gpu_agents.size()) agent_counter_info = get_agent_counter_info(gpu_agents); @@ -154,7 +177,19 @@ client_data::initialize_event_info() _device_qualifier_sym, JOIN(" ", "Device", _dev_index) }; - auto _counter_info = agent_counter_info.at(_agent_id); + // Check if agent info is available ( i.e., counters are supported) + auto agent_info_it = agent_counter_info.find(_agent_id); + if(agent_info_it == agent_counter_info.end()) + { + ROCPROFSYS_WARNING_F(0, + "Skipping GPU device %lu (%s, handle=0x%lx) due to " + "counter not found for the specified architecture\n", + _dev_index, aitr.agent->name.c_str(), + aitr.agent->handle); + continue; + } + + auto _counter_info = agent_info_it->second; std::sort(_counter_info.begin(), _counter_info.end(), [](const rocprofiler_tool_counter_info_t& lhs, const rocprofiler_tool_counter_info_t& rhs) {