1ba08cd4df
* Removing SQTT buffer size limitation * Update source/lib/rocprofiler-sdk/thread_trace/core.cpp * Added testing for buffer size. Formatting. * Add test as unstable * Increase default buffer size * Apply suggestions from code review Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com> * Fix typo from code review * Update tests/thread-trace/agent.cpp --------- Co-authored-by: Giovanni <gbaraldi@amd.com> Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com>
268 строки
12 KiB
C++
268 строки
12 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
|
|
#include "lib/common/logging.hpp"
|
|
#include "lib/rocprofiler-sdk/hsa/details/fmt.hpp"
|
|
|
|
#include <fmt/core.h>
|
|
#include <hsa/hsa_ext_amd.h>
|
|
#include "glog/logging.h"
|
|
|
|
// Evaluates the HSA call `fn` exactly once and treats any status other than
// HSA_STATUS_SUCCESS as unrecoverable: the status code together with the
// caller-supplied `message` is streamed to ROCP_FATAL, followed by exit(1)
// in case the fatal log statement returns.
//
// `message` must be streamable with operator<<. It is only evaluated on the
// failure path. The macro expands to a braced block and declares a local
// named `status` inside it.
#define CHECK_HSA(fn, message)                                                                     \
    {                                                                                              \
        auto status = (fn);                                                                        \
        if(status != HSA_STATUS_SUCCESS)                                                           \
        {                                                                                          \
            ROCP_FATAL << "HSA Err: " << status << " (" << message << ")\n";                       \
            exit(1);                                                                               \
        }                                                                                          \
    }
|
|
|
|
namespace rocprofiler
|
|
{
|
|
namespace aql
|
|
{
|
|
/**
 * @brief Resolve each requested metric into per-block-instance AQL profile events.
 *
 * For every metric, the block information is looked up with get_query_info() and one
 * event is created per block instance it reports. Each event is stored twice (in the
 * metric's `instances` and `events` lists), validated with
 * aqlprofile_validate_pmc_event(), and registered in the event-to-metric lookup.
 * A failing validation call, or an event reported as invalid, is logged at fatal
 * severity. Finally the flattened event list is cached in `_events`.
 *
 * @param agent   Identifier of the agent the counters will be collected on.
 * @param metrics Metrics to collect. A non-empty event string is parsed with
 *                std::stoul, which throws if the string is not a number.
 */
CounterPacketConstruct::CounterPacketConstruct(rocprofiler_agent_id_t               agent,
                                               const std::vector<counters::Metric>& metrics)
: _agent(agent)
{
    for(const auto& x : metrics)
    {
        auto query_info                = get_query_info(_agent, x);
        _metrics.emplace_back().metric = x;

        // An empty event string leaves the event id at zero.
        uint64_t event_id = 0;
        if(!x.event().empty()) event_id = std::stoul(x.event(), nullptr);

        ROCP_TRACE << fmt::format("Fetching events for counter {} (id={}, instance_count={}) on "
                                  "agent {} (node-id:{})(name:{})",
                                  x.name(),
                                  event_id,
                                  query_info.instance_count,
                                  agent.handle,
                                  rocprofiler::agent::get_agent(agent)->node_id,
                                  rocprofiler::agent::get_agent(agent)->name);

        for(unsigned block_index = 0; block_index < query_info.instance_count; ++block_index)
        {
            // The same descriptor is recorded in both per-metric lists, so build it once.
            // Only the low 32 bits of the parsed event id are kept.
            const aqlprofile_pmc_event_t event = {
                .block_index = block_index,
                .event_id    = static_cast<uint32_t>(event_id & 0xFFFFFFFF),
                .flags       = aqlprofile_pmc_event_flags_t{x.flags()},
                .block_name  = static_cast<hsa_ven_amd_aqlprofile_block_name_t>(query_info.id)};

            _metrics.back().instances.push_back(event);
            _metrics.back().events.push_back(event);

            // Default to "invalid" so the check below never reads an indeterminate value
            // should the validation call return success without writing its result.
            bool validate_event_result = false;

            auto aql_agent = *CHECK_NOTNULL(rocprofiler::agent::get_aql_agent(agent));

            LOG_IF(FATAL,
                   aqlprofile_validate_pmc_event(aql_agent,
                                                 &_metrics.back().events.back(),
                                                 &validate_event_result) != HSA_STATUS_SUCCESS)
                << "aqlprofile_validate_pmc_event failed for counter " << x.name()
                << " (block index " << block_index << ", event id " << event_id << ")";
            ROCP_FATAL_IF(!validate_event_result)
                << "Invalid Metric: " << block_index << " " << event_id;
            _event_to_metric[_metrics.back().events.back()] = x;
        }
    }
    _events = get_all_events();
}
|
|
|
|
std::unique_ptr<hsa::CounterAQLPacket>
|
|
CounterPacketConstruct::construct_packet(const CoreApiTable& coreapi, const AmdExtTable& ext)
|
|
{
|
|
const auto* agent =
|
|
rocprofiler::agent::get_agent_cache(CHECK_NOTNULL(rocprofiler::agent::get_agent(_agent)));
|
|
if(!agent) ROCP_FATAL << "No agent cache for agent id: " << _agent.handle;
|
|
|
|
hsa_amd_memory_pool_access_t _access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
|
|
ext.hsa_amd_agent_memory_pool_get_info_fn(agent->get_hsa_agent(),
|
|
agent->kernarg_pool(),
|
|
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
|
|
static_cast<void*>(&_access));
|
|
|
|
hsa::CounterAQLPacket::CounterMemoryPool pool;
|
|
|
|
if(_access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) pool.bIgnoreKernArg = true;
|
|
|
|
pool.allocate_fn = ext.hsa_amd_memory_pool_allocate_fn;
|
|
pool.allow_access_fn = ext.hsa_amd_agents_allow_access_fn;
|
|
pool.free_fn = ext.hsa_amd_memory_pool_free_fn;
|
|
pool.api_copy_fn = coreapi.hsa_memory_copy_fn;
|
|
pool.fill_fn = ext.hsa_amd_memory_fill_fn;
|
|
|
|
pool.gpu_agent = agent->get_hsa_agent();
|
|
pool.cpu_pool_ = agent->cpu_pool();
|
|
pool.kernarg_pool_ = agent->kernarg_pool();
|
|
|
|
const auto* aql_agent = rocprofiler::agent::get_aql_agent(agent->get_rocp_agent()->id);
|
|
if(aql_agent == nullptr) throw std::runtime_error("Could not get AQL agent!");
|
|
|
|
if(_events.empty()) ROCP_TRACE << "No events for pkt";
|
|
|
|
return std::make_unique<hsa::CounterAQLPacket>(*aql_agent, pool, _events);
|
|
}
|
|
|
|
/**
 * @brief Prepare the memory pool and the ATT parameter list used for thread trace packets.
 *
 * The trace memory pool receives the HSA allocation/access/free/copy entry points and
 * the agent's CPU and GPU pools. The parameter list is rebuilt in this order: target
 * compute unit, shader engine mask, SIMD selection, low 32 bits of the buffer size,
 * the high 32 bits of the buffer size (only when non-zero), and finally, when
 * perfcounter control is non-zero and perfcounters were requested, one entry per
 * perfcounter followed by the perfcounter control entry.
 *
 * @param agent   Agent cache supplying the HSA agent handle and memory pools.
 * @param params  Thread trace settings to translate into ATT parameters.
 * @param coreapi HSA core API table (supplies hsa_memory_copy_fn).
 * @param ext     HSA AMD extension table (supplies the memory pool functions).
 */
ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache&             agent,
                                                         const thread_trace_parameter_pack& params,
                                                         const CoreApiTable&                coreapi,
                                                         const AmdExtTable&                 ext)
{
    tracepool                 = hsa::TraceMemoryPool{};
    tracepool.allocate_fn     = ext.hsa_amd_memory_pool_allocate_fn;
    tracepool.allow_access_fn = ext.hsa_amd_agents_allow_access_fn;
    tracepool.free_fn         = ext.hsa_amd_memory_pool_free_fn;
    tracepool.api_copy_fn     = coreapi.hsa_memory_copy_fn;
    tracepool.gpu_agent       = agent.get_hsa_agent();
    tracepool.cpu_pool_       = agent.cpu_pool();
    tracepool.gpu_pool_       = agent.gpu_pool();

    // Appends a parameter that carries a single 32-bit value.
    const auto push_value = [this](hsa_ven_amd_aqlprofile_parameter_name_t name, uint32_t value) {
        aql_params.push_back({name, {value}});
    };

    aql_params.clear();

    push_value(HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET,
               static_cast<uint32_t>(params.target_cu));
    push_value(HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK,
               static_cast<uint32_t>(params.shader_engine_mask));
    push_value(HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION,
               static_cast<uint32_t>(params.simd_select));
    push_value(HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE,
               static_cast<uint32_t>(params.buffer_size));

    // The upper half of the buffer size is only sent when it holds any set bits.
    const auto size_high_word = static_cast<uint32_t>(params.buffer_size >> 32);
    if(size_high_word != 0)
    {
        push_value(static_cast<hsa_ven_amd_aqlprofile_parameter_name_t>(
                       AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH),
                   size_high_word);
    }

    // Perfcounter entries need both a non-zero control value and at least one counter.
    const auto counter_ctrl = static_cast<uint32_t>(params.perfcounter_ctrl);
    if(counter_ctrl == 0 || params.perfcounters.empty()) return;

    for(const auto& [counter_id, simd_mask] : params.perfcounters)
    {
        aqlprofile_att_parameter_t counter_param{};
        counter_param.parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME;
        counter_param.counter_id     = counter_id;
        counter_param.simd_mask      = simd_mask;
        aql_params.push_back(counter_param);
    }

    // NOTE(review): the control value is passed on reduced by one; presumably zero is
    // reserved for "disabled" on the caller side — confirm against the aqlprofile API.
    aqlprofile_att_parameter_t ctrl_param{};
    ctrl_param.parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL;
    ctrl_param.value          = counter_ctrl - 1;
    aql_params.push_back(ctrl_param);
}
|
|
|
|
std::unique_ptr<hsa::TraceControlAQLPacket>
|
|
ThreadTraceAQLPacketFactory::construct_control_packet()
|
|
{
|
|
auto num_params = static_cast<uint32_t>(aql_params.size());
|
|
auto profile = aqlprofile_att_profile_t{tracepool.gpu_agent, aql_params.data(), num_params};
|
|
auto packet = std::make_unique<hsa::TraceControlAQLPacket>(this->tracepool, profile);
|
|
packet->clear();
|
|
return packet;
|
|
}
|
|
|
|
std::unique_ptr<hsa::CodeobjMarkerAQLPacket>
|
|
ThreadTraceAQLPacketFactory::construct_load_marker_packet(uint64_t id, uint64_t addr, uint64_t size)
|
|
{
|
|
return std::make_unique<hsa::CodeobjMarkerAQLPacket>(tracepool, id, addr, size, false, false);
|
|
}
|
|
|
|
std::unique_ptr<hsa::CodeobjMarkerAQLPacket>
|
|
ThreadTraceAQLPacketFactory::construct_unload_marker_packet(uint64_t id)
|
|
{
|
|
return std::make_unique<hsa::CodeobjMarkerAQLPacket>(tracepool, id, 0, 0, false, true);
|
|
}
|
|
|
|
std::vector<aqlprofile_pmc_event_t>
|
|
CounterPacketConstruct::get_all_events() const
|
|
{
|
|
std::vector<aqlprofile_pmc_event_t> ret;
|
|
for(const auto& metric : _metrics)
|
|
{
|
|
ret.insert(ret.end(), metric.instances.begin(), metric.instances.end());
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/**
 * @brief Look up the metric registered for an event.
 *
 * @param event Event to search for in the event-to-metric lookup.
 * @return Pointer to the stored metric, or nullptr when the lookup yields nothing.
 */
const counters::Metric*
CounterPacketConstruct::event_to_metric(const aqlprofile_pmc_event_t& event) const
{
    // The lookup helper already yields a null pointer on a miss, so pass it through.
    return rocprofiler::common::get_val(_event_to_metric, event);
}
|
|
|
|
const std::vector<aqlprofile_pmc_event_t>&
|
|
CounterPacketConstruct::get_counter_events(const counters::Metric& metric) const
|
|
{
|
|
for(const auto& prof_metric : _metrics)
|
|
{
|
|
if(prof_metric.metric.id() == metric.id())
|
|
{
|
|
return prof_metric.events;
|
|
}
|
|
}
|
|
throw std::runtime_error(fmt::format("Cannot Find Events for {}", metric));
|
|
}
|
|
|
|
/**
 * @brief Check that the requested events fit the per-block counter limits.
 *
 * Events are grouped by (block name, block index). For each group the number of
 * requested events is compared with the value get_block_counters() reported for the
 * first event seen in that group.
 *
 * @return ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT if any group requests more events
 *         than its limit, otherwise ROCPROFILER_STATUS_SUCCESS.
 */
rocprofiler_status_t
CounterPacketConstruct::can_collect()
{
    using block_key_t = std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>;

    std::map<block_key_t, int64_t> requested;  // events requested per block instance
    std::map<block_key_t, int64_t> capacity;   // limit reported per block instance

    for(auto& entry : _metrics)
    {
        for(auto& event : entry.events)
        {
            const auto key   = std::make_pair(event.block_name, event.block_index);
            auto       found = requested.find(key);
            if(found != requested.end())
            {
                ++found->second;
                continue;
            }

            // First event for this block instance: start its tally and fetch its limit.
            requested.emplace(key, 1);
            capacity.emplace(key, get_block_counters(_agent, event));
        }
    }

    // Reject the request as soon as one block instance is over its limit.
    for(const auto& [key, count] : requested)
    {
        const auto* limit = CHECK_NOTNULL(common::get_val(capacity, key));
        if(count > *limit)
        {
            return ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT;
        }
    }
    return ROCPROFILER_STATUS_SUCCESS;
}
|
|
} // namespace aql
|
|
} // namespace rocprofiler
|