Files
rocm-systems/samples/counter_collection/device_counting_synchronous.cpp
T
Welton, Benjamin 007285272b [SWDEV-518071] Return HSA not loaded status (device counter collection) (#242)
* [SWDEV-518071] Return HSA not loaded status (device counter collection)

This is a state that a caller would want to know about in order to tell
whether they got no counters because of a failure or because they were trying to
collect counters too early (as is the case in the sample, which can
attempt to collect counters before HSA is initialized).

* Minor edit

* format

* [SWDEV-518081] Simplify Metric Loading (#243)

* [SWDEV-518071] Return HSA not loaded status (device counter collection)

This is a state that a caller would want to know about in order to tell
whether they got no counters because of a failure or because they were trying to
collect counters too early (as is the case in the sample, which can
attempt to collect counters before HSA is initialized).
* [SWDEV-518324] Add AST update support

Allows ASTs to be updated (instead of being an unchangeable
static value). Adds a shared pointer return type to prevent
static destructors/modifications from invalidating potentially in-use
AST definitions. No functionality/use changes in this PR.
* [SWDEV-518593] Add updatable dimension cache + fix string issues (#252)

* [SWDEV-518593] Add updatable dimension cache + fix string issues

Updates dimension cache to use the same design pattern as AST/Metrics.

Fixes the string scoping issue seen in ASTs, which appears here as well.

* Add rocprofiler_create_counter

Creates derived counters based on input from the API. This PR does three
things:

1. Adds the API + test case
2. Validates that an AST can be constructed from the counter supplied.
3. Updates metrics, ast, and dimension caches to include the new metric.

Metric should be available for use immediately after the call completes.

Due to the regeneration of ASTs, this call should not be performed in
performance sensitive code.

* Suggestion fixes

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>

* Minor tweak

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>

* Fixes for comments

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Kandula, Venkateshwar reddy <Venkateshwarreddy.Kandula@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Kandula, Venkateshwar reddy <Venkateshwarreddy.Kandula@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>
2025-03-14 01:07:16 -07:00

514 строки
20 KiB
C++

// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "client.hpp"
#include <atomic>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <shared_mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/context.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
// Evaluates `result` (an expression yielding a rocprofiler_status_t) exactly once. When the status
// is not ROCPROFILER_STATUS_SUCCESS the failure is reported on std::cerr and a std::runtime_error
// is thrown; both messages carry the stringified call, the source location, `msg` and the status
// string returned by rocprofiler_get_status_string.
//
// The expansion is wrapped in do { ... } while(0) so that the macro behaves as a single statement
// and can be used in unbraced if/else branches. `msg` may be anything streamable to std::ostream.
#define ROCPROFILER_CALL(result, msg)                                                              \
    do                                                                                             \
    {                                                                                              \
        rocprofiler_status_t CHECKSTATUS = (result);                                               \
        if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS)                                              \
        {                                                                                          \
            std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS);                   \
            std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg            \
                      << " failed with error code " << CHECKSTATUS << ": " << status_msg           \
                      << std::endl;                                                                \
            std::stringstream errmsg{};                                                            \
            errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg               \
                   << " failure (" << status_msg << ")";                                           \
            throw std::runtime_error(errmsg.str());                                                \
        }                                                                                          \
    } while(0)
// Hook exported to the sample application; it performs no work and always returns 1.
int
start()
{
    constexpr int start_result = 1;
    return start_result;
}
namespace
{
// Class to sample counter values from the ROCProfiler API for a single agent.
//
// This class is not thread safe and should not be shared between threads.
// Only a single instance of this class should be created per agent.
//
// Instances are neither copyable nor movable: the constructor hands `this` to the device counting
// service as callback user data, so the object has to stay at a fixed address for its lifetime.
class counter_sampler
{
public:
    // Setup system profiling for an agent (creates the context and buffer and registers this
    // object with the device counting service). Throws std::runtime_error if a setup call fails.
    explicit counter_sampler(rocprofiler_agent_id_t agent);

    counter_sampler(const counter_sampler&) = delete;
    counter_sampler& operator=(const counter_sampler&) = delete;
    counter_sampler(counter_sampler&&)                 = delete;
    counter_sampler& operator=(counter_sampler&&) = delete;

    // Decode the counter name of a record. Returns "UNKNOWN" if the counter id is not known.
    std::string decode_record_name(const rocprofiler_record_counter_t& rec) const;

    // Get the dimensions of a record (what CU/SE/etc the counter is for). High cost operation
    // should be cached if possible. The result maps dimension name -> position of the record
    // within that dimension.
    static std::unordered_map<std::string, size_t> get_record_dimensions(
        const rocprofiler_record_counter_t& rec);

    // Sample the counter values for a set of counters, returns the records in the out parameter.
    // The returned status is the status of the sampling call.
    rocprofiler_status_t sample_counter_values(const std::vector<std::string>&            counters,
                                               std::vector<rocprofiler_record_counter_t>& out);

    // Get the available GPU agents on the system
    static std::vector<rocprofiler_agent_v0_t> get_available_agents();

    // Flush the buffer owned by this sampler (the returned status is ignored).
    void flush() const { rocprofiler_flush_buffer(buf_); }

    // Stop the context owned by this sampler (the returned status is ignored).
    void stop() const { rocprofiler_stop_context(ctx_); }

private:
    // Agent whose counters are sampled.
    rocprofiler_agent_id_t agent_ = {};
    // Context and buffer created in the constructor.
    rocprofiler_context_id_t ctx_ = {};
    rocprofiler_buffer_id_t  buf_ = {};
    // Profile applied by set_profile(); a zero handle means no profile has been selected yet.
    rocprofiler_profile_config_id_t profile_ = {.handle = 0};
    // Profiles already created, keyed by the requested list of counter names.
    std::map<std::vector<std::string>, rocprofiler_profile_config_id_t> cached_profiles_;
    // Expected number of records per profile, keyed by profile handle.
    std::map<uint64_t, uint64_t> profile_sizes_;
    // Counter id handle -> counter name, filled lazily by decode_record_name().
    mutable std::map<uint64_t, std::string> id_to_name_;

    // Internal function used to set the profile for the agent when start_context is called
    void set_profile(rocprofiler_context_id_t                 ctx,
                     rocprofiler_agent_set_profile_callback_t cb) const;

    // Get the size of a counter in number of records
    static size_t get_counter_size(rocprofiler_counter_id_t counter);

    // Get the supported counters for an agent (counter name -> counter id)
    static std::unordered_map<std::string, rocprofiler_counter_id_t> get_supported_counters(
        rocprofiler_agent_id_t agent);

    // Get the dimensions of a counter
    static std::vector<rocprofiler_record_dimension_info_t> get_counter_dimensions(
        rocprofiler_counter_id_t counter);
};
// Builds the profiling state for `agent`: a context, a buffer with its own callback thread, and
// the device counting service bound to that context/buffer/agent.
// Every setup step goes through ROCPROFILER_CALL, so a failing step throws std::runtime_error.
counter_sampler::counter_sampler(rocprofiler_agent_id_t agent)
: agent_(agent)
{
// Setup context (should only be done once per agent)
auto client_thread = rocprofiler_callback_thread_t{};
ROCPROFILER_CALL(rocprofiler_create_context(&ctx_), "context creation failed");
// The buffer callback below is an empty lambda: this sampler does not process records delivered
// through the buffer. NOTE(review): 4096 / 2048 are presumably the buffer size and watermark --
// confirm against the rocprofiler_create_buffer documentation.
ROCPROFILER_CALL(rocprofiler_create_buffer(
ctx_,
4096,
2048,
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
[](rocprofiler_context_id_t,
rocprofiler_buffer_id_t,
rocprofiler_record_header_t**,
size_t,
void*,
uint64_t) {},
nullptr,
&buf_),
"buffer creation failed");
// Create a callback thread and assign it to the buffer created above.
ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread),
"failure creating callback thread");
ROCPROFILER_CALL(rocprofiler_assign_callback_thread(buf_, client_thread),
"failed to assign thread for buffer");
// Register the device counting service. `this` is passed as user data so that the callback can
// forward to set_profile(), which applies the currently selected profile (if any).
ROCPROFILER_CALL(rocprofiler_configure_device_counting_service(
ctx_,
buf_,
agent,
[](rocprofiler_context_id_t context_id,
rocprofiler_agent_id_t,
rocprofiler_agent_set_profile_callback_t set_config,
void* user_data) {
if(user_data)
{
auto* sampler = static_cast<counter_sampler*>(user_data);
sampler->set_profile(context_id, set_config);
}
},
this),
"Could not setup buffered service");
}
// Translate the counter id embedded in a record into the counter's name.
//
// The id -> name table is filled from get_supported_counters() whenever it is found empty and is
// reused afterwards. When the id has no entry, the id is logged to std::clog and "UNKNOWN" is
// returned.
std::string
counter_sampler::decode_record_name(const rocprofiler_record_counter_t& rec) const
{
    if(id_to_name_.empty())
    {
        for(const auto& entry : counter_sampler::get_supported_counters(agent_))
        {
            id_to_name_.emplace(entry.second.handle, entry.first);
        }
    }

    rocprofiler_counter_id_t counter_id = {.handle = 0};
    rocprofiler_query_record_counter_id(rec.id, &counter_id);

    const auto known = id_to_name_.find(counter_id.handle);
    if(known != id_to_name_.end()) return known->second;

    std::clog << "Unknown counter id = " << counter_id.handle << "\n";
    return "UNKNOWN";
}
// Resolve, for every dimension of the record's counter, the position this record occupies in
// that dimension. The result maps dimension name -> position.
std::unordered_map<std::string, size_t>
counter_sampler::get_record_dimensions(const rocprofiler_record_counter_t& rec)
{
    rocprofiler_counter_id_t counter_id = {.handle = 0};
    rocprofiler_query_record_counter_id(rec.id, &counter_id);

    std::unordered_map<std::string, size_t> positions;
    for(const auto& dimension : get_counter_dimensions(counter_id))
    {
        size_t index = 0;
        rocprofiler_query_record_dimension_position(rec.id, dimension.id, &index);
        positions.emplace(dimension.name, index);
    }
    return positions;
}
// Sample the given counters once and store the resulting records in `out`.
//
// A profile is created the first time a particular counter list is requested and cached for
// later calls; counter names the agent does not support are reported on std::cerr and skipped.
// `out` is sized to the expected record count before sampling and resized afterwards to the
// count written back by the sampling call.
//
// Returns the status of the sampling call, or ROCPROFILER_STATUS_ERROR if the expected size of
// the profile cannot be looked up. Throws std::runtime_error if the profile cannot be created.
rocprofiler_status_t
counter_sampler::sample_counter_values(const std::vector<std::string>&            counters,
                                       std::vector<rocprofiler_record_counter_t>& out)
{
    auto cache_entry = cached_profiles_.find(counters);
    if(cache_entry == cached_profiles_.end())
    {
        size_t                                record_count = 0;
        rocprofiler_profile_config_id_t       profile      = {};
        std::vector<rocprofiler_counter_id_t> gpu_counters;
        const auto                            supported = get_supported_counters(agent_);
        for(const auto& name : counters)
        {
            const auto match = supported.find(name);
            if(match != supported.end())
            {
                gpu_counters.push_back(match->second);
                record_count += get_counter_size(match->second);
            }
            else
            {
                std::cerr << "Counter " << name << " not found\n";
            }
        }
        ROCPROFILER_CALL(rocprofiler_create_profile_config(
                             agent_, gpu_counters.data(), gpu_counters.size(), &profile),
                         "Could not create profile");
        // The lookup above missed, so emplace inserts and hands back the new entry.
        cache_entry = cached_profiles_.emplace(counters, profile).first;
        profile_sizes_.emplace(profile.handle, record_count);
    }

    try
    {
        out.resize(profile_sizes_.at(cache_entry->second.handle));
    } catch(const std::exception& e)
    {
        std::cerr << "Caught exception: " << e.what() << "\n";
        return ROCPROFILER_STATUS_ERROR;
    }

    // set_profile() reads profile_ when the context is started.
    profile_ = cache_entry->second;
    rocprofiler_start_context(ctx_);
    std::this_thread::sleep_for(std::chrono::milliseconds(50));

    size_t     sampled = out.size();
    const auto status  = rocprofiler_sample_device_counting_service(
        ctx_, {}, ROCPROFILER_COUNTER_FLAG_NONE, out.data(), &sampled);

    rocprofiler_stop_context(ctx_);
    out.resize(sampled);
    return status;
}
// Query rocprofiler for the agents on the system and return copies of those whose type is
// ROCPROFILER_AGENT_TYPE_GPU. Throws std::runtime_error if the query fails or reports an agent
// info version other than ROCPROFILER_AGENT_INFO_VERSION_0.
std::vector<rocprofiler_agent_v0_t>
counter_sampler::get_available_agents()
{
    std::vector<rocprofiler_agent_v0_t>     agents;
    rocprofiler_query_available_agents_cb_t iterate_cb = [](rocprofiler_agent_version_t agents_ver,
                                                            const void**                agents_arr,
                                                            size_t                      num_agents,
                                                            void*                       udata) {
        if(agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0)
        {
            throw std::runtime_error{"unexpected rocprofiler agent version"};
        }

        // udata is the address of the `agents` vector handed to the query below.
        auto& gpu_agents = *static_cast<std::vector<rocprofiler_agent_v0_t>*>(udata);
        for(size_t idx = 0; idx != num_agents; ++idx)
        {
            const auto& candidate = *static_cast<const rocprofiler_agent_v0_t*>(agents_arr[idx]);
            if(candidate.type != ROCPROFILER_AGENT_TYPE_GPU) continue;
            gpu_agents.emplace_back(candidate);
        }
        return ROCPROFILER_STATUS_SUCCESS;
    };

    ROCPROFILER_CALL(
        rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0,
                                           iterate_cb,
                                           sizeof(rocprofiler_agent_t),
                                           const_cast<void*>(static_cast<const void*>(&agents))),
        "query available agents");
    return agents;
}
// Apply the currently selected profile through the supplied callback.
// Nothing is applied while profile_ still has a zero handle.
void
counter_sampler::set_profile(rocprofiler_context_id_t                 ctx,
                             rocprofiler_agent_set_profile_callback_t cb) const
{
    if(profile_.handle == 0) return;
    cb(ctx, profile_);
}
// Compute the number of records a counter produces: the product of the instance sizes of all of
// its dimensions (1 if no dimension is reported). The status returned by the dimension iteration
// is not checked.
size_t
counter_sampler::get_counter_size(rocprofiler_counter_id_t counter)
{
    size_t total = 1;

    auto multiply_dims = [](rocprofiler_counter_id_t,
                            const rocprofiler_record_dimension_info_t* dim_info,
                            size_t                                     num_dims,
                            void*                                      user_data) {
        // user_data is the address of `total` passed to the iteration below.
        auto& product = *static_cast<size_t*>(user_data);
        for(size_t idx = 0; idx != num_dims; ++idx)
        {
            product *= dim_info[idx].instance_size;
        }
        return ROCPROFILER_STATUS_SUCCESS;
    };

    rocprofiler_iterate_counter_dimensions(counter, multiply_dims, static_cast<void*>(&total));
    return total;
}
// Build a map from counter name to counter id for every counter the agent supports.
// Both the counter iteration and each per-counter info query go through ROCPROFILER_CALL, so a
// failure in either throws std::runtime_error.
std::unordered_map<std::string, rocprofiler_counter_id_t>
counter_sampler::get_supported_counters(rocprofiler_agent_id_t agent)
{
std::unordered_map<std::string, rocprofiler_counter_id_t> out;
std::vector<rocprofiler_counter_id_t> gpu_counters;
// Step 1: collect the ids of all supported counters. The callback receives the address of
// gpu_counters as user data and appends every id it is given.
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
agent,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
void* user_data) {
std::vector<rocprofiler_counter_id_t>* vec =
static_cast<std::vector<rocprofiler_counter_id_t>*>(user_data);
for(size_t i = 0; i < num_counters; i++)
{
vec->push_back(counters[i]);
}
return ROCPROFILER_STATUS_SUCCESS;
},
static_cast<void*>(&gpu_counters)),
"Could not fetch supported counters");
// Step 2: query the info of each counter and index the id by the counter's name.
for(auto& counter : gpu_counters)
{
// Left uninitialized on purpose: it is only read after the query below succeeded
// (ROCPROFILER_CALL throws otherwise).
rocprofiler_counter_info_v0_t version;
ROCPROFILER_CALL(
rocprofiler_query_counter_info(
counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&version)),
"Could not query info for counter");
// emplace keeps the first id seen if two counters report the same name.
out.emplace(version.name, counter);
}
return out;
}
// Collect the dimension descriptions of a counter, in the order the iteration callback receives
// them. Throws std::runtime_error if the iteration fails.
std::vector<rocprofiler_record_dimension_info_t>
counter_sampler::get_counter_dimensions(rocprofiler_counter_id_t counter)
{
    std::vector<rocprofiler_record_dimension_info_t> dims;
    rocprofiler_available_dimensions_cb_t            cb =
        [](rocprofiler_counter_id_t,
           const rocprofiler_record_dimension_info_t* dim_info,
           size_t                                     num_dims,
           void*                                      user_data) {
            // user_data is the address of `dims` passed to the iteration below.
            auto& collected =
                *static_cast<std::vector<rocprofiler_record_dimension_info_t>*>(user_data);
            for(size_t idx = 0; idx != num_dims; ++idx)
            {
                collected.push_back(dim_info[idx]);
            }
            return ROCPROFILER_STATUS_SUCCESS;
        };
    ROCPROFILER_CALL(rocprofiler_iterate_counter_dimensions(counter, cb, &dims),
                     "Could not iterate counter dimensions");
    return dims;
}
// Access the shutdown flag shared by tool_fini and the sampling thread. It starts out false;
// tool_fini sets it to true to request shutdown and the sampling thread stores false again once
// it has left its loop. Every call returns a reference to the same function-local static.
std::atomic<bool>&
exit_toggle()
{
    static std::atomic<bool> stop_requested{false};
    return stop_requested;
}
// Finalize function handed to tool_init; invoked from the atexit handler tool_init installs.
rocprofiler_client_finalize_t finalize = nullptr;
// Client id received in rocprofiler_configure; reset to nullptr by tool_fini.
rocprofiler_client_id_t* client_id = nullptr;
// Sampler for the first GPU agent; created in tool_init and released in tool_fini.
std::shared_ptr<counter_sampler> sampler = {};
// Background sampling thread; allocated with new in tool_init, joined and deleted in tool_fini.
std::thread* sampler_thread = nullptr;
} // namespace
// Tool initialization callback.
//
// Stores the finalize function, installs an atexit handler that finalizes the client if it is
// still registered, creates a sampler for the first GPU agent and starts a background thread that
// samples SQ_WAVES roughly every 50 ms until tool_fini requests shutdown through exit_toggle().
// Returns 0 on success and -1 when no agent is available.
int
tool_init(rocprofiler_client_finalize_t fini_func, void*)
{
    finalize = fini_func;

    std::atexit([]() {
        if(client_id) finalize(*client_id);
    });

    // Get the agents available on the device
    auto gpu_agents = counter_sampler::get_available_agents();
    if(gpu_agents.empty())
    {
        std::cerr << "No agents found\n";
        return -1;
    }

    // Use the first agent found
    sampler = std::make_shared<counter_sampler>(gpu_agents[0].id);

    auto sampling_loop = []() {
        const auto                                poll_interval = std::chrono::milliseconds(50);
        size_t                                    sample_idx    = 1;
        std::vector<rocprofiler_record_counter_t> records;

        while(sampler && !exit_toggle().load())
        {
            const auto status = sampler->sample_counter_values({"SQ_WAVES"}, records);
            if(status == ROCPROFILER_STATUS_ERROR_HSA_NOT_LOADED)
            {
                // Sampling was attempted before HSA was loaded; retry shortly without counting
                // this attempt as a sample.
                std::clog << "HSA not loaded yet....\n";
                std::this_thread::sleep_for(poll_interval);
                continue;
            }

            std::clog << "Sample " << sample_idx << ":\n";
            if(status == ROCPROFILER_STATUS_SUCCESS)
            {
                for(const auto& record : records)
                {
                    if(!sampler) break;
                    const auto record_name = sampler->decode_record_name(record);
                    std::clog << "\tCounter: " << record.id << " Name: " << record_name
                              << " Value: " << record.counter_value
                              << " User data: " << record.user_data.value << "\n";

                    // Dimension details are only printed for the first sample.
                    if(sample_idx != 1) continue;
                    if(!sampler) break;
                    for(const auto& [dim_name, dim_pos] : sampler->get_record_dimensions(record))
                    {
                        std::clog << "\t\tDimension Name: " << dim_name << ": " << dim_pos << "\n";
                    }
                }
            }

            ++sample_idx;
            std::this_thread::sleep_for(poll_interval);
        }

        // Acknowledge the shutdown request so that tool_fini can proceed.
        exit_toggle().store(false);
    };
    sampler_thread = new std::thread{sampling_loop};

    // no errors
    return 0;
}
// Tool finalization callback.
//
// Requests shutdown of the sampling thread, waits for it to finish, stops and flushes the sampler
// and finally flushes (and, unless it is std::cout/std::cerr, deletes) the output stream passed
// as `user_data`.
//
// Every resource is checked before use so that finalization is safe even when tool_init bailed
// out before creating the sampler or the sampling thread (e.g. because no agent was found).
void
tool_fini(void* user_data)
{
    std::clog << "In tool fini\n" << std::flush;

    // The atexit handler installed by tool_init only finalizes while client_id is set.
    client_id = nullptr;

    // Ask the sampling thread to leave its loop. join() blocks until the thread has finished,
    // so no additional polling of the flag is required.
    exit_toggle().store(true);
    if(sampler_thread != nullptr)
    {
        if(sampler_thread->joinable()) sampler_thread->join();
        delete sampler_thread;
        sampler_thread = nullptr;
    }

    // The sampling thread is gone at this point, so the sampler can be shut down and released.
    if(sampler)
    {
        sampler->stop();
        sampler->flush();
        sampler.reset();
    }

    if(auto* output_stream = static_cast<std::ostream*>(user_data); output_stream != nullptr)
    {
        *output_stream << std::flush;
        // Streams other than std::cout/std::cerr were allocated with new in rocprofiler_configure.
        if(output_stream != &std::cout && output_stream != &std::cerr) delete output_stream;
    }

    std::clog << "Completed tool fini\n" << std::flush;
}
// Entry point through which the tool is configured.
//
// Names the client, remembers its id, logs the SDK version in use and selects the output stream:
// the ROCPROFILER_SAMPLE_OUTPUT_FILE environment variable may be "stdout", "stderr" or a file
// name; when it is unset the file "counter_collection.log" is used. The stream is handed to
// tool_init/tool_fini as tool data. Returns a pointer to a function-local static configure result.
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t                 version,
                      const char*              runtime_version,
                      uint32_t                 priority,
                      rocprofiler_client_id_t* id)
{
    // set the client name
    id->name  = "CounterClientSample";
    client_id = id;

    // version is encoded as major * 10000 + minor * 100 + patch
    const uint32_t ver_major = version / 10000;
    const uint32_t ver_minor = (version % 10000) / 100;
    const uint32_t ver_patch = version % 100;

    // generate info string
    std::stringstream banner{};
    banner << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << ver_major
           << "." << ver_minor << "." << ver_patch << " (" << runtime_version << ")";
    std::clog << banner.str() << std::endl;

    std::string filename = "counter_collection.log";
    if(const char* outfile = getenv("ROCPROFILER_SAMPLE_OUTPUT_FILE"); outfile != nullptr)
    {
        filename = outfile;
    }

    // Map the requested target onto a stream; file streams are heap allocated and later deleted
    // by tool_fini.
    auto open_stream = [](const std::string& target) -> std::ostream* {
        if(target == "stdout") return &std::cout;
        if(target == "stderr") return &std::cerr;
        return new std::ofstream{target};
    };
    std::ostream* output_stream = open_stream(filename);

    // create configure data
    static auto cfg =
        rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
                                            &tool_init,
                                            &tool_fini,
                                            static_cast<void*>(output_stream)};

    // return pointer to configure data
    return &cfg;
}