#include "lib/rocprofiler/aql/packet_construct.hpp"

// NOTE(review): the two angle-bracket include targets and every explicit
// template argument in this file were absent from the text under review and
// have been restored from how the values are used. In particular
// `counters::Metric`, `hsa::AQLPacket` and <fmt/core.h> are assumptions --
// confirm them against packet_construct.hpp before merging.
#include <fmt/core.h>

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <map>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

#include "glog/logging.h"

namespace rocprofiler
{
namespace aql
{
/// @brief Expands every requested metric into one hardware event per block
///        instance and validates each event against the agent.
///
/// @param agent   Agent the counters are collected on.
/// @param metrics Metrics to collect; must not be empty.
///
/// @throws std::runtime_error if @p metrics is empty, or if can_collect()
///         finds a block that is asked for more counters than it provides.
///
/// An event that fails validation is reported through LOG(FATAL) rather than
/// through an exception.
AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache&               agent,
                                       const std::vector<counters::Metric>& metrics)
: _agent(agent)
{
    if(metrics.empty())
    {
        throw std::runtime_error("No metrics supplied");
    }

    // Validate that the counter exists and construct the block instances
    // for the counter.
    for(const auto& x : metrics)
    {
        auto query_info = get_query_info(_agent.get_hsa_agent(), x);

        auto& entry  = _metrics.emplace_back();
        entry.metric = x;

        // std::atoi yields 0 for text that does not start with a number; that
        // best-effort parse is kept and the validation call below decides
        // whether the resulting event is acceptable.
        const auto event_id = static_cast<uint32_t>(std::atoi(x.event().c_str()));

        for(unsigned block_index = 0; block_index < query_info.instance_count; ++block_index)
        {
            entry.instances.push_back(
                {static_cast<hsa_ven_amd_aqlprofile_block_name_t>(query_info.id),
                 block_index,
                 event_id});

            // Initialised so that a validation call which fails without
            // writing its out-parameter cannot leave an indeterminate value.
            bool validate_event_result = false;
            LOG_IF(FATAL,
                   hsa_ven_amd_aqlprofile_validate_event(_agent.get_hsa_agent(),
                                                         &entry.instances.back(),
                                                         &validate_event_result) !=
                       HSA_STATUS_SUCCESS)
                << "Could not validate event: " << block_index << " " << event_id;
            LOG_IF(FATAL, !validate_event_result)
                << "Invalid Metric: " << block_index << " " << event_id;
        }
    }

    // Check that we can collect all of the metrics in a single execution
    // with a single AQL packet
    can_collect();
    _events = get_all_events();
}

/// @brief Builds a packet object holding the start, stop and read AQL packets
///        for the events gathered by the constructor.
///
/// @param ext AMD extension table supplying the memory-pool entry points.
/// @return Owning pointer to the populated packet.
///
/// @throws std::runtime_error if there are no events, if the kernarg pool
///         access query fails or reports that access is never allowed, if the
///         profile library reports zero-sized buffers, if a buffer cannot be
///         allocated, or if any of the start/stop/read packets cannot be
///         generated.
std::unique_ptr<hsa::AQLPacket>
AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
{
    const size_t MEM_PAGE_MASK = 0x1000 - 1;

    // Reject the request before any packet storage is created.
    if(_events.empty())
    {
        throw std::runtime_error("Constructing packet with no events");
    }

    auto  pkt_ptr = std::make_unique<hsa::AQLPacket>(ext.hsa_amd_memory_pool_free_fn);
    auto& pkt     = *pkt_ptr;

    pkt.profile = hsa_ven_amd_aqlprofile_profile_t{
        _agent.get_hsa_agent(),
        HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC,  // SPM?
        _events.data(),
        static_cast<uint32_t>(_events.size()),
        nullptr,
        0u,
        hsa_ven_amd_aqlprofile_descriptor_t{.ptr = nullptr, .size = 0},
        hsa_ven_amd_aqlprofile_descriptor_t{.ptr = nullptr, .size = 0}};
    auto& profile = pkt.profile;

    // Ask how the agent may access its kernarg pool. The status is checked so
    // that a failed query is not misreported as "access never allowed".
    hsa_amd_memory_pool_access_t pool_access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
    if(ext.hsa_amd_agent_memory_pool_get_info_fn(_agent.get_hsa_agent(),
                                                 _agent.kernarg_pool(),
                                                 HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
                                                 static_cast<void*>(&pool_access)) !=
       HSA_STATUS_SUCCESS)
    {
        throw std::runtime_error(
            fmt::format("Could not query memory pool access for agent {}",
                        _agent.get_hsa_agent().handle));
    }

    if(pool_access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)
    {
        throw std::runtime_error(
            fmt::format("Agent {} does not allow memory pool access for counter collection",
                        _agent.get_hsa_agent().handle));
    }

    auto throw_if_failed = [](auto status, const char* message) {
        if(status != HSA_STATUS_SUCCESS)
        {
            throw std::runtime_error(message);
        }
    };

    // First call passes no packet: it is used only to obtain the buffer sizes,
    // which are checked immediately below.
    throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, nullptr),
                    "could not generate packet sizes");

    if(profile.command_buffer.size == 0 || profile.output_buffer.size == 0)
    {
        throw std::runtime_error(
            fmt::format("No command or output buffer size set. CMD_BUF={} PROFILE_BUF={}",
                        profile.command_buffer.size,
                        profile.output_buffer.size));
    }

    // Allocates `size` bytes, rounded up to a whole 4 KiB page, from `pool` and
    // stores the address in `*mem_loc`. When the pool allocation fails the
    // buffer comes from malloc() instead. Returns true when malloc() was used
    // (presumably so the packet can release the buffer with the matching
    // deallocator -- confirm in the packet's destructor).
    auto alloc_and_check = [&](auto& pool, auto** mem_loc, auto size) -> bool {
        bool   malloced     = false;
        size_t page_aligned = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
        if(ext.hsa_amd_memory_pool_allocate_fn(
               pool, page_aligned, 0, static_cast<void**>(mem_loc)) != HSA_STATUS_SUCCESS)
        {
            *mem_loc = malloc(page_aligned);
            // The buffers are written to right after allocation, so a failed
            // fallback must stop here instead of handing back a null pointer.
            if(*mem_loc == nullptr)
            {
                throw std::runtime_error(fmt::format(
                    "Could not allocate {} bytes for counter collection", page_aligned));
            }
            malloced = true;
        }
        else
        {
            CHECK(*mem_loc);
            // Memory is accessible by both the GPU and CPU, unlock the buffer
            // for sharing with this agent.
            hsa_agent_t agent = _agent.get_hsa_agent();
            LOG_IF(FATAL,
                   ext.hsa_amd_agents_allow_access_fn(1, &agent, nullptr, *mem_loc) !=
                       HSA_STATUS_SUCCESS)
                << "Error: Allowing access to Command Buffer";
        }
        return malloced;
    };

    // Build command and output buffers
    pkt.command_buf_mallocd = alloc_and_check(
        _agent.cpu_pool(), &profile.command_buffer.ptr, profile.command_buffer.size);
    pkt.output_buffer_malloced = alloc_and_check(
        _agent.kernarg_pool(), &profile.output_buffer.ptr, profile.output_buffer.size);
    memset(profile.output_buffer.ptr, 0x0, profile.output_buffer.size);

    // throw if we do not construct the packets correctly.
    throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start),
                    "could not generate start packet");
    throw_if_failed(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop),
                    "could not generate stop packet");
    throw_if_failed(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read),
                    "could not generate read packet");

    return pkt_ptr;
}

/// @brief Flattens the per-metric instance lists into one list of events.
/// @return Every instance of every metric, in metric order.
std::vector<hsa_ven_amd_aqlprofile_event_t>
AQLPacketConstruct::get_all_events() const
{
    std::vector<hsa_ven_amd_aqlprofile_event_t> ret;
    for(const auto& metric : _metrics)
    {
        ret.insert(ret.end(), metric.instances.begin(), metric.instances.end());
    }
    return ret;
}

/// @brief Verifies that no (block name, block index) pair is asked for more
///        counters than get_block_counters() reports for it.
///
/// @throws std::runtime_error naming the first block found to exceed its limit.
void
AQLPacketConstruct::can_collect()
{
    // Verify that the counters fit within hardware limits
    using block_key_t = std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>;
    std::map<block_key_t, int64_t> counter_count;
    std::map<block_key_t, int64_t> max_allowed;

    for(auto& metric : _metrics)
    {
        for(auto& instance : metric.instances)
        {
            auto block_pair       = std::make_pair(instance.block_name, instance.block_index);
            auto [iter, inserted] = counter_count.emplace(block_pair, 0);
            iter->second++;
            // The limit is looked up only the first time a block is seen.
            if(inserted)
            {
                max_allowed.emplace(block_pair,
                                    get_block_counters(_agent.get_hsa_agent(), instance));
            }
        }
    }

    // Check if the block count > max count
    for(const auto& [block_name, count] : counter_count)
    {
        if(auto* max = CHECK_NOTNULL(common::get_val(max_allowed, block_name)); count > *max)
        {
            throw std::runtime_error(
                fmt::format("Block {} exceeds max number of hardware counters ({} > {})",
                            static_cast<int64_t>(block_name.first),
                            count,
                            *max));
        }
    }
}
}  // namespace aql
}  // namespace rocprofiler