Files
rocm-systems/source/lib/rocprofiler/aql/packet_construct.cpp
T
Saurabh Verma 63775f241a Evaluation portion for metrics (#123)
* EvaluateAST and validation of RawAST

* Adding MetricDimension class and concepts

* set_dimensions() and improved ValidateRawAST()

* source formatting (clang-format v11) (#124)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* Addressing 1st round of review comments

* Modified the parser production rules to support the right syntax for REDUCE and SELECT derived metric expressions

* changes to raw_ast.hpp and fmt::format()

* Parser tests updated to support corrected REDUCE and SELECT syntax

* changes to EvaluateAST::set_dimensions() and other dimension related code changes

* Added a test for EvaluateAST::evaluate() to test basic arithmetic on EvaluateAST

* Format source code (via clang-format v11) on sauverma/evaluate-ast (#146)

* source formatting (clang-format v11)

* Add dimension information to counter record

Restructures counter records to have the following design:

rocprofiler_record_id_t which is an int64_t that encodes
both the counter id and dimension information for the
record. The first 16 bits are reserved for the counter id while
the last 48 are split among the dimensions specified in
rocprofiler_dimension_t (currently 8 bits per dimension).
Each of the 8 bits for the dimension stores the dimension
value for that dimension for this record (i.e. a value of 8
on dimension XCC would denote XCC[8] for the counter). The
split among the dimensions will automatically adjust as
dimensions are added or removed.

The record also contains a union of {int64_t hw_counter, double
derived_counter} to specify the value of the record at
rocprofiler_record_id_t. int64_t denotes a physical hardware
counter that has an integer type, while the double is used for derived
counters (which of the two types a counter's values use needs to be queried
separately).

* Integration of new id type + other fixes

---------

Co-authored-by: sauverma93 <sauverma93@users.noreply.github.com>
Co-authored-by: Benjamin Welton <bewelton@amd.com>

* Fixed issues with reduce() implementation and added a test for reduce()

* Updated parser syntax for reduce() and updated the parser test. Disabled the test for select()

* Build warning fixes

* Modifications to support fetching xcc/etc info from agent

* Initial plumbing working for single counters, cleanup+tests still needed

* Remove string comparison from reduce ops

* source formatting (clang-format v11) (#163)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* cmake formatting (cmake-format) (#164)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* source formatting (clang-format v11) (#171)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* Merged with master

* source formatting (clang-format v11) (#172)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* source formatting (clang-format v11) (#173)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* Test fix

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: bwelton <bwelton@users.noreply.github.com>
Co-authored-by: sauverma93 <sauverma93@users.noreply.github.com>
Co-authored-by: Benjamin Welton <bewelton@amd.com>
2023-11-03 21:10:40 -07:00

207 行
7.7 KiB
C++

#include "lib/rocprofiler/aql/packet_construct.hpp"
#include <fmt/core.h>
#include <hsa/hsa_ext_amd.h>
#include "glog/logging.h"
namespace rocprofiler
{
namespace aql
{
/// Builds the aqlprofile event list for the requested metrics on @p agent.
///
/// For every metric, one event is created per block instance reported by
/// get_query_info(). Each event is validated against the agent and recorded in the
/// event -> metric lookup table. Afterwards can_collect() verifies the per-block
/// counter limits and the flattened event list is cached in _events.
///
/// @param agent   Agent the metrics are collected on.
/// @param metrics Metrics to collect; must not be empty.
/// @throws std::runtime_error if @p metrics is empty, or if can_collect() rejects
///         the requested set of counters.
AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache&               agent,
                                       const std::vector<counters::Metric>& metrics)
: _agent(agent)
{
    if(metrics.empty())
    {
        throw std::runtime_error("No metrics supplied");
    }

    // Validate that the counter exists and construct the block instances
    // for the counter.
    for(const auto& x : metrics)
    {
        auto  query_info = get_query_info(_agent.get_hsa_agent(), x);
        auto& entry      = _metrics.emplace_back();
        entry.metric     = x;

        // NOTE(review): atoi() yields 0 for a non-numeric event string; presumably
        // the metric definitions always carry a numeric event id -- confirm.
        const uint32_t event_id = std::atoi(x.event().c_str());
        const auto     block_name =
            static_cast<hsa_ven_amd_aqlprofile_block_name_t>(query_info.id);

        for(unsigned block_index = 0; block_index < query_info.instance_count; ++block_index)
        {
            entry.instances.push_back({block_name, block_index, event_id});

            // Initialized so the value is well defined even if the call below
            // does not write to it.
            bool validate_event_result = false;
            LOG_IF(FATAL,
                   hsa_ven_amd_aqlprofile_validate_event(_agent.get_hsa_agent(),
                                                         &entry.instances.back(),
                                                         &validate_event_result) !=
                       HSA_STATUS_SUCCESS)
                << "Failed to validate event: " << block_index << " " << event_id;
            LOG_IF(FATAL, !validate_event_result)
                << "Invalid Metric: " << block_index << " " << event_id;

            _event_to_metric[std::make_tuple(block_name, block_index, event_id)] = x;
        }
    }

    // Check that we can collect all of the metrics in a single execution
    // with a single AQL packet
    can_collect();
    _events = get_all_events();
}
/// Creates an AQL packet set (start / stop / read) that collects the events
/// gathered by the constructor.
///
/// Steps:
///  1. Fill in the aqlprofile profile descriptor for the cached events.
///  2. Check that the agent's kernarg pool is accessible at all.
///  3. Call hsa_ven_amd_aqlprofile_start() without a packet, then require non-zero
///     command and output buffer sizes.
///  4. Allocate both buffers (HSA memory pool first, malloc() as fallback).
///  5. Generate the start, stop and read packets.
///
/// @param ext AMD extension table providing the memory pool entry points.
/// @return Owning pointer to the populated packet.
/// @throws std::runtime_error if there are no events, the pool is not accessible,
///         the buffer sizes are zero, a buffer cannot be allocated, or any of the
///         aqlprofile packet generation calls fails.
std::unique_ptr<hsa::AQLPacket>
AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
{
    // Used to round buffer sizes up to a multiple of 0x1000 bytes.
    const size_t MEM_PAGE_MASK = 0x1000 - 1;
    auto         pkt_ptr = std::make_unique<hsa::AQLPacket>(ext.hsa_amd_memory_pool_free_fn);
    auto&        pkt     = *pkt_ptr;
    if(_events.empty())
    {
        throw std::runtime_error("Constructing packet with no events");
    }

    pkt.profile = hsa_ven_amd_aqlprofile_profile_t{
        _agent.get_hsa_agent(),
        HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC,  // SPM?
        _events.data(),
        static_cast<uint32_t>(_events.size()),
        nullptr,
        0u,
        hsa_ven_amd_aqlprofile_descriptor_t{.ptr = nullptr, .size = 0},
        hsa_ven_amd_aqlprofile_descriptor_t{.ptr = nullptr, .size = 0}};
    auto& profile = pkt.profile;

    // Query how the agent may access the kernarg pool. _access keeps its
    // NEVER_ALLOWED initial value if the query does not overwrite it, which is
    // rejected just below.
    hsa_amd_memory_pool_access_t _access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
    ext.hsa_amd_agent_memory_pool_get_info_fn(_agent.get_hsa_agent(),
                                              _agent.kernarg_pool(),
                                              HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
                                              static_cast<void*>(&_access));
    if(_access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
    {
        throw std::runtime_error(
            fmt::format("Agent {} does not allow memory pool access for counter collection",
                        _agent.get_hsa_agent().handle));
    }

    auto throw_if_failed = [](auto status, auto& message) {
        if(status != HSA_STATUS_SUCCESS)
        {
            throw std::runtime_error(message);
        }
    };

    // Called without a packet; the buffer sizes in the profile are expected to be
    // filled in by this call and are checked right after.
    throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, nullptr),
                    "could not generate packet sizes");

    if(profile.command_buffer.size == 0 || profile.output_buffer.size == 0)
    {
        throw std::runtime_error(
            fmt::format("No command or output buffer size set. CMD_BUF={} PROFILE_BUF={}",
                        profile.command_buffer.size,
                        profile.output_buffer.size));
    }

    // Allocate a buffer from `pool`, falling back to malloc() if the pool
    // allocation fails. Returns true when the buffer came from malloc().
    auto alloc_and_check = [&](auto& pool, auto** mem_loc, auto size) -> bool {
        bool   malloced     = false;
        size_t page_aligned = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
        if(ext.hsa_amd_memory_pool_allocate_fn(
               pool, page_aligned, 0, static_cast<void**>(mem_loc)) != HSA_STATUS_SUCCESS)
        {
            *mem_loc = malloc(page_aligned);
            // malloc() may fail as well; without this check a null buffer would be
            // passed to memset() and to the packet generation calls below.
            if(*mem_loc == nullptr)
            {
                throw std::runtime_error(fmt::format(
                    "Failed to allocate {} bytes for counter collection buffer", page_aligned));
            }
            malloced = true;
        }
        else
        {
            CHECK(*mem_loc);
            hsa_agent_t agent = _agent.get_hsa_agent();
            // Grant the agent access to the pool allocation so the buffer can be
            // shared with it.
            LOG_IF(FATAL,
                   ext.hsa_amd_agents_allow_access_fn(1, &agent, nullptr, *mem_loc) !=
                       HSA_STATUS_SUCCESS)
                << "Error: Allowing access to Command Buffer";
        }
        return malloced;
    };

    // Build command and output buffers
    pkt.command_buf_mallocd = alloc_and_check(
        _agent.cpu_pool(), &profile.command_buffer.ptr, profile.command_buffer.size);
    pkt.output_buffer_malloced = alloc_and_check(
        _agent.kernarg_pool(), &profile.output_buffer.ptr, profile.output_buffer.size);
    memset(profile.output_buffer.ptr, 0x0, profile.output_buffer.size);

    // throw if we do not construct the packets correctly.
    throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start),
                    "could not generate start packet");
    throw_if_failed(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop),
                    "could not generate stop packet");
    throw_if_failed(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read),
                    "could not generate read packet");
    return pkt_ptr;
}
std::vector<hsa_ven_amd_aqlprofile_event_t>
AQLPacketConstruct::get_all_events() const
{
std::vector<hsa_ven_amd_aqlprofile_event_t> ret;
for(const auto& metric : _metrics)
{
ret.insert(ret.end(), metric.instances.begin(), metric.instances.end());
}
return ret;
}
/// Looks up the metric that an aqlprofile event was created for.
///
/// @param event Event identified by its block name, block index and counter id.
/// @return Pointer to the matching metric stored in _event_to_metric, or nullptr
///         if the lookup finds nothing.
const counters::Metric*
AQLPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const
{
    const auto key = std::make_tuple(event.block_name, event.block_index, event.counter_id);
    // get_val() yields a pointer that is null on a miss, so it can be returned as is.
    return rocprofiler::common::get_val(_event_to_metric, key);
}
void
AQLPacketConstruct::can_collect()
{
// Verify that the counters fit within harrdware limits
std::map<std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>, int64_t> counter_count;
std::map<std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>, int64_t> max_allowed;
for(auto& metric : _metrics)
{
for(auto& instance : metric.instances)
{
auto block_pair = std::make_pair(instance.block_name, instance.block_index);
auto [iter, inserted] = counter_count.emplace(block_pair, 0);
iter->second++;
if(inserted)
{
max_allowed.emplace(block_pair,
get_block_counters(_agent.get_hsa_agent(), instance));
}
}
}
// Check if the block count > max count
for(auto& [block_name, count] : counter_count)
{
if(auto* max = CHECK_NOTNULL(common::get_val(max_allowed, block_name)); count > *max)
{
throw std::runtime_error(
fmt::format("Block {} exceeds max number of hardware counters ({} > {})",
static_cast<int64_t>(block_name.first),
count,
*max));
}
}
}
} // namespace aql
} // namespace rocprofiler