[SWDEV-518071] Return HSA not loaded status (device counter collection) (#242)

* [SWDEV-518071] Return HSA not loaded status (device counter collection)

This is a state that a caller would want to know about to understand whether
they got no counters because of a failure or because they were trying to
collect counters too early (as is the case in the sample, which can
attempt to collect counters before HSA is initialized).

* Minor edit

* format

* [SWDEV-518081] Simplify Metric Loading (#243)

* [SWDEV-518071] Return HSA not loaded status (device counter collection)

This is a state that a caller would want to know about to understand whether
they got no counters because of a failure or because they were trying to
collect counters too early (as is the case in the sample, which can
attempt to collect counters before HSA is initialized).
* [SWDEV-518324] Add AST update support

Allows ASTs to be updated (instead of being an unchangeable
static value). Adds a shared pointer return type to protect against
static destructors/modifications from invalidating potentially in-use
AST definitions. No functionality/use changes in this PR.
* [SWDEV-518593] Add updatable dimension cache + fix string issues (#252)

* [SWDEV-518593] Add updatable dimension cache + fix string issues

Updates dimension cache to use the same design pattern as AST/Metrics.

Fixes the string scoping issue seen in ASTs, which appears here as well.

* Add rocprofiler_create_counter

Creates derived counters based on input from the API. This PR does three
things:

1. Adds the API + test case
2. Validates that an AST can be constructed from the counter supplied.
3. Updates metrics, ast, and dimension caches to include the new metric.

Metric should be available for use immediately after the call completes.

Due to the regeneration of ASTs, this call should not be performed in
performance-sensitive code.

* Suggestion fixes

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>

* Minor tweak

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>

* Fixes for comments

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Kandula, Venkateshwar reddy <Venkateshwarreddy.Kandula@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Kandula, Venkateshwar reddy <Venkateshwarreddy.Kandula@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>
This commit is contained in:
Welton, Benjamin
2025-03-14 01:07:16 -07:00
کامیت شده توسط GitHub
والد c30bb7cbda
کامیت 007285272b
25 فایلهای تغییر یافته به همراه 633 افزوده شده و 491 حذف شده
@@ -78,7 +78,7 @@ public:
counter_sampler(rocprofiler_agent_id_t agent);
// Decode the counter name of a record
const std::string& decode_record_name(const rocprofiler_record_counter_t& rec) const;
std::string decode_record_name(const rocprofiler_record_counter_t& rec) const;
// Get the dimensions of a record (what CU/SE/etc the counter is for). High cost operation
// should be cached if possible.
@@ -86,8 +86,8 @@ public:
const rocprofiler_record_counter_t& rec);
// Sample the counter values for a set of counters, returns the records in the out parameter.
void sample_counter_values(const std::vector<std::string>& counters,
std::vector<rocprofiler_record_counter_t>& out);
rocprofiler_status_t sample_counter_values(const std::vector<std::string>& counters,
std::vector<rocprofiler_record_counter_t>& out);
// Get the available agents on the system
static std::vector<rocprofiler_agent_v0_t> get_available_agents();
@@ -165,7 +165,7 @@ counter_sampler::counter_sampler(rocprofiler_agent_id_t agent)
"Could not setup buffered service");
}
const std::string&
std::string
counter_sampler::decode_record_name(const rocprofiler_record_counter_t& rec) const
{
if(id_to_name_.empty())
@@ -179,6 +179,11 @@ counter_sampler::decode_record_name(const rocprofiler_record_counter_t& rec) con
rocprofiler_counter_id_t counter_id = {.handle = 0};
rocprofiler_query_record_counter_id(rec.id, &counter_id);
if(id_to_name_.find(counter_id.handle) == id_to_name_.end())
{
std::clog << "Unknown counter id = " << counter_id.handle << "\n";
return "UNKNOWN";
}
return id_to_name_.at(counter_id.handle);
}
@@ -199,7 +204,7 @@ counter_sampler::get_record_dimensions(const rocprofiler_record_counter_t& rec)
return out;
}
void
rocprofiler_status_t
counter_sampler::sample_counter_values(const std::vector<std::string>& counters,
std::vector<rocprofiler_record_counter_t>& out)
{
@@ -228,15 +233,23 @@ counter_sampler::sample_counter_values(const std::vector<std::string>&
profile_sizes_.emplace(profile.handle, expected_size);
profile_cached = cached_profiles_.find(counters);
}
out.resize(profile_sizes_.at(profile_cached->second.handle));
try
{
out.resize(profile_sizes_.at(profile_cached->second.handle));
} catch(const std::exception& e)
{
std::cerr << "Caught exception: " << e.what() << "\n";
return ROCPROFILER_STATUS_ERROR;
}
profile_ = profile_cached->second;
rocprofiler_start_context(ctx_);
std::this_thread::sleep_for(std::chrono::milliseconds(50));
size_t out_size = out.size();
rocprofiler_sample_device_counting_service(
auto status = rocprofiler_sample_device_counting_service(
ctx_, {}, ROCPROFILER_COUNTER_FLAG_NONE, out.data(), &out_size);
rocprofiler_stop_context(ctx_);
out.resize(out_size);
return status;
}
std::vector<rocprofiler_agent_v0_t>
@@ -392,22 +405,31 @@ tool_init(rocprofiler_client_finalize_t fini_func, void*)
std::vector<rocprofiler_record_counter_t> records;
while(sampler && exit_toggle().load() == false)
{
sampler->sample_counter_values({"SQ_WAVES"}, records);
std::clog << "Sample " << count << ":\n";
for(const auto& record : records)
auto status = sampler->sample_counter_values({"SQ_WAVES"}, records);
if(status == ROCPROFILER_STATUS_ERROR_HSA_NOT_LOADED)
{
if(!sampler) break;
auto recname = sampler->decode_record_name(record);
std::clog << "\tCounter: " << record.id << " Name: " << recname
<< " Value: " << record.counter_value
<< " User data: " << record.user_data.value << "\n";
if(count == 1)
std::clog << "HSA not loaded yet....\n";
std::this_thread::sleep_for(std::chrono::milliseconds(50));
continue;
}
std::clog << "Sample " << count << ":\n";
if(status == ROCPROFILER_STATUS_SUCCESS)
{
for(const auto& record : records)
{
if(!sampler) break;
auto dims = sampler->get_record_dimensions(record);
for(const auto& [name, pos] : dims)
auto recname = sampler->decode_record_name(record);
std::clog << "\tCounter: " << record.id << " Name: " << recname
<< " Value: " << record.counter_value
<< " User data: " << record.user_data.value << "\n";
if(count == 1)
{
std::clog << "\t\tDimension Name: " << name << ": " << pos << "\n";
if(!sampler) break;
auto dims = sampler->get_record_dimensions(record);
for(const auto& [name, pos] : dims)
{
std::clog << "\t\tDimension Name: " << name << ": " << pos << "\n";
}
}
}
}
@@ -116,7 +116,7 @@ main(int argc, char** argv)
int ntotdevice = 0;
HIP_CALL(hipGetDeviceCount(&ntotdevice));
long nitr = 5000;
long nitr = 50000;
long nsync = 500;
long ndevice = 0;