[AFAR VII] rocprofiler_sample_device_counting_service return data as part of API call (#57)

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Benjamin Welton <ben@amd.com>

[ROCm/rocprofiler-sdk commit: 253c9adfc1]
This commit is contained in:
Welton, Benjamin
2024-12-06 22:37:45 -08:00
committed by GitHub
szülő 22b4e6739d
commit 1850de7ee1
8 fájl változott, egészen pontosan 140 új sor hozzáadva és 54 régi sor törölve
@@ -122,6 +122,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
- Changed naming of "dispatch profiling service" to a more descriptive "dispatch counting service". To convert existing tool or user code to the new names, the following sed can be used: `-type f -exec sed -i -e 's/dispatch_profile_counting_service/dispatch_counting_service/g' -e 's/dispatch_profile.h/dispatch_counting_service.h/g' -e 's/rocprofiler_profile_counting_dispatch_callback_t/rocprofiler_dispatch_counting_service_callback_t/g' -e 's/rocprofiler_profile_counting_dispatch_data_t/rocprofiler_dispatch_counting_service_data_t/g' -e 's/rocprofiler_profile_counting_dispatch_record_t/rocprofiler_dispatch_counting_service_record_t/g' {} +`
- `FETCH_SIZE` metric on gfx94x now uses `TCC_BUBBLE` for 128B reads.
- PMC dispatch-based counter collection serialization is now per-device instead of being global across all devices.
- Added output return functionality to `rocprofiler_sample_device_counting_service`.
- Added `rocprofiler_load_counter_definition`.
### Resolved issues
@@ -358,7 +358,7 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
// below to select the profile config to use when a kernel dispatch is
// received.
get_profile_cache().emplace(
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"SQ_WAVES"}));
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"TCC_HIT"}));
}
auto client_thread = rocprofiler_callback_thread_t{};
@@ -289,8 +289,11 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
rocprofiler_start_context(get_client_ctx());
while(exit_toggle().load() == false)
{
rocprofiler_sample_device_counting_service(
get_client_ctx(), {.value = count}, ROCPROFILER_COUNTER_FLAG_NONE);
rocprofiler_sample_device_counting_service(get_client_ctx(),
{.value = count},
ROCPROFILER_COUNTER_FLAG_NONE,
nullptr,
nullptr);
count++;
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
@@ -106,18 +106,28 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
* @param [in] context_id context id
* @param [in] user_data User supplied data, included in records outputted to buffer.
* @param [in] flags Flags to specify how the counter data should be collected (defaults to sync).
* @param [in,out] output_records Output records collected via sampling (output is also written to
* buffer). Must be allocated by caller.
* @param [in,out] rec_count On entry, this is the maximum number of records rocprof can store in
* output_records. On exit, contains the number of actual records (or, when
* ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES is returned, the number of records required).
* @return ::rocprofiler_status_t
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or
* the context is not configured for agent profiling.
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR Returned if another operation is in progress (
* start/stop ctx or another read).
* @retval ::ROCPROFILER_STATUS_ERROR Returned if HSA has not been initialized yet.
* @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES Returned if output_records is set but
* rec_count is too small to store the results.
* @retval ::ROCPROFILER_STATUS_SUCCESS Returned if read request was successful.
* @retval ::ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT Returned if ASYNC is being used while
* output_records is not null.
*/
rocprofiler_status_t
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags) ROCPROFILER_API;
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
rocprofiler_record_counter_t* output_records,
size_t* rec_count) ROCPROFILER_API;
/** @} */
@@ -156,6 +156,10 @@ agent_async_handler(hsa_signal_value_t /*signal_v*/, void* data)
{
val.user_data = callback_data.user_data;
val.agent_id = prof_config->agent->id;
if(callback_data.cached_counters)
{
callback_data.cached_counters->push_back(val);
}
buf->emplace(
ROCPROFILER_BUFFER_CATEGORY_COUNTERS, ROCPROFILER_COUNTER_RECORD_VALUE, val);
}
@@ -253,9 +257,10 @@ init_callback_data(rocprofiler::counters::agent_callback_data& callback_data,
* and trigger the async handler manually.
*/
rocprofiler_status_t
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags)
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
std::vector<rocprofiler_record_counter_t>* out_counters)
{
rocprofiler_status_t status = ROCPROFILER_STATUS_SUCCESS;
if(!ctx->device_counter_collection)
@@ -282,6 +287,18 @@ read_agent_ctx(const context::context* ctx,
for(auto& callback_data : agent_ctx.agent_data)
{
auto wait_if_sync = [&]() {
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
{
// Wait for any inprogress samples to complete before returning
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
HSA_SIGNAL_CONDITION_EQ,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE);
}
};
if(!callback_data.profile || !callback_data.set_profile) continue;
const auto* agent = agent::get_agent_cache(callback_data.profile->agent);
@@ -295,6 +312,11 @@ read_agent_ctx(const context::context* ctx,
// No AQL packet, nothing to do here.
if(!callback_data.packet) continue;
wait_if_sync();
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
callback_data.cached_counters = out_counters;
// If we have a packet but no hardware counters, the caller is expecting
// non-hardware-based counter values to be returned. We can skip packet injection
// and trigger the async handler directly.
@@ -302,16 +324,7 @@ read_agent_ctx(const context::context* ctx,
{
callback_data.user_data = user_data;
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, -1);
// Wait for the barrier/read packet to complete
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
{
// Wait for any inprogress samples to complete before returning
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
HSA_SIGNAL_CONDITION_EQ,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE);
}
wait_if_sync();
continue;
}
@@ -334,17 +347,8 @@ read_agent_ctx(const context::context* ctx,
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, 0);
callback_data.user_data = user_data;
submitPacket(agent->profile_queue(), &barrier.barrier_and);
// Wait for the barrier/read packet to complete
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
{
// Wait for any inprogress samples to complete before returning
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
HSA_SIGNAL_CONDITION_EQ,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE);
}
wait_if_sync();
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0) callback_data.cached_counters = nullptr;
}
agent_ctx.status.exchange(rocprofiler::context::device_counting_service::state::ENABLED);
@@ -27,6 +27,7 @@
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <cstddef>
namespace rocprofiler
{
@@ -55,11 +56,12 @@ struct agent_callback_data
rocprofiler_user_data_t user_data = {.value = 0};
rocprofiler_user_data_t callback_data = {.value = 0};
std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
rocprofiler_agent_id_t agent_id = {.handle = 0};
rocprofiler_device_counting_service_callback_t cb = nullptr;
rocprofiler_buffer_id_t buffer = {.handle = 0};
bool set_profile = false;
std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
rocprofiler_agent_id_t agent_id = {.handle = 0};
rocprofiler_device_counting_service_callback_t cb = nullptr;
rocprofiler_buffer_id_t buffer = {.handle = 0};
bool set_profile = false;
std::vector<rocprofiler_record_counter_t>* cached_counters = nullptr;
agent_callback_data() = default;
agent_callback_data(agent_callback_data&& rhs) noexcept
@@ -115,9 +117,10 @@ stop_agent_ctx(const context::context* ctx);
// read calls are not allowed in ASYNC mode and will result in
// this call waiting for the previous sample to complete.
rocprofiler_status_t
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags);
read_agent_ctx(const context::context* ctx,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
std::vector<rocprofiler_record_counter_t>* out_counters);
uint64_t
submitPacket(hsa_queue_t* queue, const void* packet);
@@ -108,10 +108,10 @@ test_init()
hsa::get_queue_controller()->init(get_api_table(), get_ext_table());
}
std::vector<rocprofiler_record_counter_t>&
common::Synchronized<std::vector<rocprofiler_record_counter_t>>&
global_recs()
{
static std::vector<rocprofiler_record_counter_t> recs;
static common::Synchronized<std::vector<rocprofiler_record_counter_t>> recs;
return recs;
}
@@ -146,7 +146,7 @@ check_output_created(rocprofiler_context_id_t,
}
found_value = record->user_data.value;
// ROCP_ERROR << fmt::format("Found counter value: {}", record->counter_value);
global_recs().push_back(*record);
global_recs().wlock([&](auto& data) { data.push_back(*record); });
}
}
@@ -319,6 +319,7 @@ protected:
size_t track_metric = 0;
for(auto& metric : metrics)
{
std::vector<rocprofiler_record_counter_t> output_records(10000);
// global_recs().clear();
track_metric++;
ROCP_ERROR << "Testing metric " << metric.name();
@@ -402,9 +403,23 @@ protected:
HSA_WAIT_STATE_BLOCKED);
// Sample the counting service.
ROCPROFILER_CALL(
rocprofiler_sample_device_counting_service(ctx, {.value = track_metric}, flags),
"Could not sample");
if(flags == ROCPROFILER_COUNTER_FLAG_ASYNC)
{
ROCPROFILER_CALL(rocprofiler_sample_device_counting_service(
ctx, {.value = track_metric}, flags, nullptr, nullptr),
"Could not sample");
}
else
{
global_recs().wlock([&](auto& _data) { _data.clear(); });
size_t out_count = output_records.size();
ROCPROFILER_CALL(
rocprofiler_sample_device_counting_service(
ctx, {.value = track_metric}, flags, output_records.data(), &out_count),
"Could not sample");
output_records.resize(out_count);
}
ROCPROFILER_CALL(rocprofiler_stop_context(ctx), "Could not stop context");
rocprofiler_flush_buffer(opt_buff_id);
@@ -417,6 +432,27 @@ protected:
{
ROCP_FATAL << "Failed to get data for " << metric.name();
}
else if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
{
auto recs_local = global_recs().rlock([](const auto& data) { return data; });
if(recs_local.size() != output_records.size())
{
ROCP_FATAL << "Output size does not match: " << recs_local.size() << " "
<< output_records.size();
}
if(!std::equal(recs_local.begin(),
recs_local.end(),
output_records.begin(),
[](const auto& a, const auto& b) {
return a.id == b.id && a.counter_value == b.counter_value &&
a.dispatch_id == b.dispatch_id &&
a.agent_id.handle == b.agent_id.handle;
}))
{
ROCP_FATAL << "Output does not match between buffer and callback";
}
}
}
hsa_signal_destroy(completion_signal);
hsa_signal_destroy(found_data);
@@ -599,9 +635,10 @@ TEST_F(device_counting_service_test, async_counters) { test_run(ROCPROFILER_COUN
TEST_F(device_counting_service_test, sync_grbm_verify)
{
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GRBM_COUNT"}, 50000);
ROCP_ERROR << global_recs().size();
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
ROCP_ERROR << local_recs.size();
for(const auto& val : global_recs())
for(const auto& val : local_recs)
{
rocprofiler_counter_id_t id;
rocprofiler_query_record_counter_id(val.id, &id);
@@ -615,9 +652,10 @@ TEST_F(device_counting_service_test, sync_grbm_verify)
TEST_F(device_counting_service_test, sync_gpu_util_verify)
{
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GPU_UTIL"}, 50000);
ROCP_ERROR << global_recs().size();
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
ROCP_ERROR << local_recs.size();
for(const auto& val : global_recs())
for(const auto& val : local_recs)
{
rocprofiler_counter_id_t id;
rocprofiler_query_record_counter_id(val.id, &id);
@@ -631,9 +669,10 @@ TEST_F(device_counting_service_test, sync_gpu_util_verify)
TEST_F(device_counting_service_test, sync_sq_waves_verify)
{
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"SQ_WAVES_sum"}, 50000);
ROCP_ERROR << global_recs().size();
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
ROCP_ERROR << local_recs.size();
for(const auto& val : global_recs())
for(const auto& val : local_recs)
{
rocprofiler_counter_id_t id;
rocprofiler_query_record_counter_id(val.id, &id);
@@ -27,6 +27,8 @@
#include "lib/rocprofiler-sdk/counters/device_counting.hpp"
#include "rocprofiler-sdk/fwd.h"
#include <string.h>
extern "C" {
rocprofiler_status_t
rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id,
@@ -40,11 +42,35 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
}
// Public C entry point that forwards a sample request to
// rocprofiler::counters::read_agent_ctx for the context registered under
// context_id.
//
// When output_records is null, the request is forwarded with no output vector.
// When output_records is non-null, the sampled records are gathered into a
// temporary vector and, on success, copied into the caller's array:
//   - ASYNC flag set          -> ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT.
//   - more records than *rec_count -> *rec_count is set to the number of
//     records collected, nothing is copied, and
//     ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES is returned.
//   - otherwise               -> *rec_count is set to the number of records
//     and they are copied into output_records.
// If read_agent_ctx reports a non-success status, that status is returned and
// neither output_records nor *rec_count is modified.
rocprofiler_status_t
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags)
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
rocprofiler_user_data_t user_data,
rocprofiler_counter_flag_t flags,
rocprofiler_record_counter_t* output_records,
size_t* rec_count)
{
if(output_records != nullptr)
{
// Returning records through the call is only supported for synchronous sampling.
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
// rec_count is dereferenced below, so it must accompany output_records.
// NOTE(review): CHECK is defined outside this view — confirm how it reacts to a null pointer.
CHECK(rec_count);
// Temporary collection target handed to read_agent_ctx as its out_counters argument.
auto recs = std::vector<rocprofiler_record_counter_t>{};
auto status = rocprofiler::counters::read_agent_ctx(
rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs);
if(status == ROCPROFILER_STATUS_SUCCESS)
{
if(recs.size() > *rec_count)
{
// Caller's array is too small: report the required size instead of copying.
*rec_count = recs.size();
return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
}
*rec_count = recs.size();
// NOTE(review): std::memcpy is declared by <cstring>; this file's include hunk adds
// <string.h>, which is not guaranteed to declare the std:: name — confirm.
std::memcpy(
output_records, recs.data(), sizeof(rocprofiler_record_counter_t) * recs.size());
}
return status;
}
// No output array supplied: sample without collecting records for the caller.
return rocprofiler::counters::read_agent_ctx(
rocprofiler::context::get_registered_context(context_id), user_data, flags);
rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr);
}
}