[AFAR VII] rocprofiler_sample_device_counting_service return data as part of API call (#57)
---------
Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Benjamin Welton <ben@amd.com>
[ROCm/rocprofiler-sdk commit: 253c9adfc1]
This commit is contained in:
@@ -122,6 +122,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
|
||||
- Changed naming of "dispatch profiling service" to a more descriptive "dispatch counting service". To convert existing tool or user code to the new names, the following find/sed command can be used: `find . -type f -exec sed -i -e 's/dispatch_profile_counting_service/dispatch_counting_service/g' -e 's/dispatch_profile.h/dispatch_counting_service.h/g' -e 's/rocprofiler_profile_counting_dispatch_callback_t/rocprofiler_dispatch_counting_service_callback_t/g' -e 's/rocprofiler_profile_counting_dispatch_data_t/rocprofiler_dispatch_counting_service_data_t/g' -e 's/rocprofiler_profile_counting_dispatch_record_t/rocprofiler_dispatch_counting_service_record_t/g' {} +`
|
||||
- `FETCH_SIZE` metric on gfx94x now uses `TCC_BUBBLE` for 128B reads.
|
||||
- PMC dispatch-based counter collection serialization is now per-device instead of being global across all devices.
|
||||
- Added output return functionality to `rocprofiler_sample_device_counting_service`.
|
||||
- Added rocprofiler_load_counter_definition.
|
||||
|
||||
### Resolved issues
|
||||
|
||||
@@ -358,7 +358,7 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
|
||||
// below to select the profile config to use when a kernel dispatch is
|
||||
// received.
|
||||
get_profile_cache().emplace(
|
||||
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"SQ_WAVES"}));
|
||||
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"TCC_HIT"}));
|
||||
}
|
||||
|
||||
auto client_thread = rocprofiler_callback_thread_t{};
|
||||
|
||||
@@ -289,8 +289,11 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
|
||||
rocprofiler_start_context(get_client_ctx());
|
||||
while(exit_toggle().load() == false)
|
||||
{
|
||||
rocprofiler_sample_device_counting_service(
|
||||
get_client_ctx(), {.value = count}, ROCPROFILER_COUNTER_FLAG_NONE);
|
||||
rocprofiler_sample_device_counting_service(get_client_ctx(),
|
||||
{.value = count},
|
||||
ROCPROFILER_COUNTER_FLAG_NONE,
|
||||
nullptr,
|
||||
nullptr);
|
||||
count++;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50));
|
||||
}
|
||||
|
||||
+13
-3
@@ -106,18 +106,28 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
|
||||
* @param [in] context_id context id
|
||||
* @param [in] user_data User supplied data, included in records outputted to buffer.
|
||||
* @param [in] flags Flags to specify how the counter data should be collected (defaults to sync).
|
||||
* @param [in,out] output_records Output records collected via sampling (output is also written to
|
||||
* buffer). Must be allocated by caller.
|
||||
* @param [in,out] rec_count On entry, this is the maximum number of records rocprof can store in
|
||||
* output_records. On exit, contains the number of actual records.
|
||||
* @return ::rocprofiler_status_t
|
||||
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or
|
||||
* the context is not configured for agent profiling.
|
||||
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR Returned if another operation is in progress (
|
||||
* start/stop ctx or another read).
|
||||
* @retval ::ROCPROFILER_STATUS_ERROR Returned if HSA has not been initialized yet.
|
||||
* @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES Returned if output_records is set but size is
|
||||
* too small to store results
|
||||
* @retval ::ROCPROFILER_STATUS_SUCCESS Returned if read request was successful.
|
||||
* @retval ::ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT Returned if ASYNC is being used while
|
||||
* output_records is not null.
|
||||
*/
|
||||
rocprofiler_status_t
|
||||
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags) ROCPROFILER_API;
|
||||
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags,
|
||||
rocprofiler_record_counter_t* output_records,
|
||||
size_t* rec_count) ROCPROFILER_API;
|
||||
|
||||
/** @} */
|
||||
|
||||
|
||||
+28
-24
@@ -156,6 +156,10 @@ agent_async_handler(hsa_signal_value_t /*signal_v*/, void* data)
|
||||
{
|
||||
val.user_data = callback_data.user_data;
|
||||
val.agent_id = prof_config->agent->id;
|
||||
if(callback_data.cached_counters)
|
||||
{
|
||||
callback_data.cached_counters->push_back(val);
|
||||
}
|
||||
buf->emplace(
|
||||
ROCPROFILER_BUFFER_CATEGORY_COUNTERS, ROCPROFILER_COUNTER_RECORD_VALUE, val);
|
||||
}
|
||||
@@ -253,9 +257,10 @@ init_callback_data(rocprofiler::counters::agent_callback_data& callback_data,
|
||||
* and trigger the async handler manually.
|
||||
*/
|
||||
rocprofiler_status_t
|
||||
read_agent_ctx(const context::context* ctx,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags)
|
||||
read_agent_ctx(const context::context* ctx,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags,
|
||||
std::vector<rocprofiler_record_counter_t>* out_counters)
|
||||
{
|
||||
rocprofiler_status_t status = ROCPROFILER_STATUS_SUCCESS;
|
||||
if(!ctx->device_counter_collection)
|
||||
@@ -282,6 +287,18 @@ read_agent_ctx(const context::context* ctx,
|
||||
|
||||
for(auto& callback_data : agent_ctx.agent_data)
|
||||
{
|
||||
auto wait_if_sync = [&]() {
|
||||
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
|
||||
{
|
||||
// Wait for any in-progress samples to complete before returning
|
||||
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
|
||||
HSA_SIGNAL_CONDITION_EQ,
|
||||
1,
|
||||
UINT64_MAX,
|
||||
HSA_WAIT_STATE_ACTIVE);
|
||||
}
|
||||
};
|
||||
|
||||
if(!callback_data.profile || !callback_data.set_profile) continue;
|
||||
const auto* agent = agent::get_agent_cache(callback_data.profile->agent);
|
||||
|
||||
@@ -295,6 +312,11 @@ read_agent_ctx(const context::context* ctx,
|
||||
// No AQL packet, nothing to do here.
|
||||
if(!callback_data.packet) continue;
|
||||
|
||||
wait_if_sync();
|
||||
|
||||
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
|
||||
callback_data.cached_counters = out_counters;
|
||||
|
||||
// If we have no hardware counters but do have a packet, the caller is expecting
|
||||
// non-hardware based counter values to be returned. We can skip packet injection
|
||||
// and trigger the async handler directly
|
||||
@@ -302,16 +324,7 @@ read_agent_ctx(const context::context* ctx,
|
||||
{
|
||||
callback_data.user_data = user_data;
|
||||
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, -1);
|
||||
// Wait for the barrier/read packet to complete
|
||||
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
|
||||
{
|
||||
// Wait for any inprogress samples to complete before returning
|
||||
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
|
||||
HSA_SIGNAL_CONDITION_EQ,
|
||||
1,
|
||||
UINT64_MAX,
|
||||
HSA_WAIT_STATE_ACTIVE);
|
||||
}
|
||||
wait_if_sync();
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -334,17 +347,8 @@ read_agent_ctx(const context::context* ctx,
|
||||
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, 0);
|
||||
callback_data.user_data = user_data;
|
||||
submitPacket(agent->profile_queue(), &barrier.barrier_and);
|
||||
|
||||
// Wait for the barrier/read packet to complete
|
||||
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
|
||||
{
|
||||
// Wait for any inprogress samples to complete before returning
|
||||
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
|
||||
HSA_SIGNAL_CONDITION_EQ,
|
||||
1,
|
||||
UINT64_MAX,
|
||||
HSA_WAIT_STATE_ACTIVE);
|
||||
}
|
||||
wait_if_sync();
|
||||
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0) callback_data.cached_counters = nullptr;
|
||||
}
|
||||
|
||||
agent_ctx.status.exchange(rocprofiler::context::device_counting_service::state::ENABLED);
|
||||
|
||||
+11
-8
@@ -27,6 +27,7 @@
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
#include <cstddef>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
@@ -55,11 +56,12 @@ struct agent_callback_data
|
||||
rocprofiler_user_data_t user_data = {.value = 0};
|
||||
rocprofiler_user_data_t callback_data = {.value = 0};
|
||||
|
||||
std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
|
||||
rocprofiler_agent_id_t agent_id = {.handle = 0};
|
||||
rocprofiler_device_counting_service_callback_t cb = nullptr;
|
||||
rocprofiler_buffer_id_t buffer = {.handle = 0};
|
||||
bool set_profile = false;
|
||||
std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
|
||||
rocprofiler_agent_id_t agent_id = {.handle = 0};
|
||||
rocprofiler_device_counting_service_callback_t cb = nullptr;
|
||||
rocprofiler_buffer_id_t buffer = {.handle = 0};
|
||||
bool set_profile = false;
|
||||
std::vector<rocprofiler_record_counter_t>* cached_counters = nullptr;
|
||||
|
||||
agent_callback_data() = default;
|
||||
agent_callback_data(agent_callback_data&& rhs) noexcept
|
||||
@@ -115,9 +117,10 @@ stop_agent_ctx(const context::context* ctx);
|
||||
// read calls are not allowed in ASYNC mode and will result in
|
||||
// this call waiting for the previous sample to complete.
|
||||
rocprofiler_status_t
|
||||
read_agent_ctx(const context::context* ctx,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags);
|
||||
read_agent_ctx(const context::context* ctx,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags,
|
||||
std::vector<rocprofiler_record_counter_t>* out_counters);
|
||||
|
||||
uint64_t
|
||||
submitPacket(hsa_queue_t* queue, const void* packet);
|
||||
|
||||
+51
-12
@@ -108,10 +108,10 @@ test_init()
|
||||
hsa::get_queue_controller()->init(get_api_table(), get_ext_table());
|
||||
}
|
||||
|
||||
std::vector<rocprofiler_record_counter_t>&
|
||||
common::Synchronized<std::vector<rocprofiler_record_counter_t>>&
|
||||
global_recs()
|
||||
{
|
||||
static std::vector<rocprofiler_record_counter_t> recs;
|
||||
static common::Synchronized<std::vector<rocprofiler_record_counter_t>> recs;
|
||||
return recs;
|
||||
}
|
||||
|
||||
@@ -146,7 +146,7 @@ check_output_created(rocprofiler_context_id_t,
|
||||
}
|
||||
found_value = record->user_data.value;
|
||||
// ROCP_ERROR << fmt::format("Found counter value: {}", record->counter_value);
|
||||
global_recs().push_back(*record);
|
||||
global_recs().wlock([&](auto& data) { data.push_back(*record); });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -319,6 +319,7 @@ protected:
|
||||
size_t track_metric = 0;
|
||||
for(auto& metric : metrics)
|
||||
{
|
||||
std::vector<rocprofiler_record_counter_t> output_records(10000);
|
||||
// global_recs().clear();
|
||||
track_metric++;
|
||||
ROCP_ERROR << "Testing metric " << metric.name();
|
||||
@@ -402,9 +403,23 @@ protected:
|
||||
HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Sample the counting service.
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_sample_device_counting_service(ctx, {.value = track_metric}, flags),
|
||||
"Could not sample");
|
||||
|
||||
if(flags == ROCPROFILER_COUNTER_FLAG_ASYNC)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_sample_device_counting_service(
|
||||
ctx, {.value = track_metric}, flags, nullptr, nullptr),
|
||||
"Could not sample");
|
||||
}
|
||||
else
|
||||
{
|
||||
global_recs().wlock([&](auto& _data) { _data.clear(); });
|
||||
size_t out_count = output_records.size();
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_sample_device_counting_service(
|
||||
ctx, {.value = track_metric}, flags, output_records.data(), &out_count),
|
||||
"Could not sample");
|
||||
output_records.resize(out_count);
|
||||
}
|
||||
ROCPROFILER_CALL(rocprofiler_stop_context(ctx), "Could not stop context");
|
||||
rocprofiler_flush_buffer(opt_buff_id);
|
||||
|
||||
@@ -417,6 +432,27 @@ protected:
|
||||
{
|
||||
ROCP_FATAL << "Failed to get data for " << metric.name();
|
||||
}
|
||||
else if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
|
||||
{
|
||||
auto recs_local = global_recs().rlock([](const auto& data) { return data; });
|
||||
|
||||
if(recs_local.size() != output_records.size())
|
||||
{
|
||||
ROCP_FATAL << "Output size does not match: " << recs_local.size() << " "
|
||||
<< output_records.size();
|
||||
}
|
||||
if(!std::equal(recs_local.begin(),
|
||||
recs_local.end(),
|
||||
output_records.begin(),
|
||||
[](const auto& a, const auto& b) {
|
||||
return a.id == b.id && a.counter_value == b.counter_value &&
|
||||
a.dispatch_id == b.dispatch_id &&
|
||||
a.agent_id.handle == b.agent_id.handle;
|
||||
}))
|
||||
{
|
||||
ROCP_FATAL << "Output does not match between buffer and callback";
|
||||
}
|
||||
}
|
||||
}
|
||||
hsa_signal_destroy(completion_signal);
|
||||
hsa_signal_destroy(found_data);
|
||||
@@ -599,9 +635,10 @@ TEST_F(device_counting_service_test, async_counters) { test_run(ROCPROFILER_COUN
|
||||
TEST_F(device_counting_service_test, sync_grbm_verify)
|
||||
{
|
||||
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GRBM_COUNT"}, 50000);
|
||||
ROCP_ERROR << global_recs().size();
|
||||
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
|
||||
ROCP_ERROR << local_recs.size();
|
||||
|
||||
for(const auto& val : global_recs())
|
||||
for(const auto& val : local_recs)
|
||||
{
|
||||
rocprofiler_counter_id_t id;
|
||||
rocprofiler_query_record_counter_id(val.id, &id);
|
||||
@@ -615,9 +652,10 @@ TEST_F(device_counting_service_test, sync_grbm_verify)
|
||||
TEST_F(device_counting_service_test, sync_gpu_util_verify)
|
||||
{
|
||||
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GPU_UTIL"}, 50000);
|
||||
ROCP_ERROR << global_recs().size();
|
||||
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
|
||||
ROCP_ERROR << local_recs.size();
|
||||
|
||||
for(const auto& val : global_recs())
|
||||
for(const auto& val : local_recs)
|
||||
{
|
||||
rocprofiler_counter_id_t id;
|
||||
rocprofiler_query_record_counter_id(val.id, &id);
|
||||
@@ -631,9 +669,10 @@ TEST_F(device_counting_service_test, sync_gpu_util_verify)
|
||||
TEST_F(device_counting_service_test, sync_sq_waves_verify)
|
||||
{
|
||||
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"SQ_WAVES_sum"}, 50000);
|
||||
ROCP_ERROR << global_recs().size();
|
||||
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
|
||||
ROCP_ERROR << local_recs.size();
|
||||
|
||||
for(const auto& val : global_recs())
|
||||
for(const auto& val : local_recs)
|
||||
{
|
||||
rocprofiler_counter_id_t id;
|
||||
rocprofiler_query_record_counter_id(val.id, &id);
|
||||
|
||||
+30
-4
@@ -27,6 +27,8 @@
|
||||
#include "lib/rocprofiler-sdk/counters/device_counting.hpp"
|
||||
#include "rocprofiler-sdk/fwd.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
extern "C" {
|
||||
rocprofiler_status_t
|
||||
rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id,
|
||||
@@ -40,11 +42,35 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
|
||||
}
|
||||
|
||||
rocprofiler_status_t
|
||||
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags)
|
||||
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
|
||||
rocprofiler_user_data_t user_data,
|
||||
rocprofiler_counter_flag_t flags,
|
||||
rocprofiler_record_counter_t* output_records,
|
||||
size_t* rec_count)
|
||||
{
|
||||
if(output_records != nullptr)
|
||||
{
|
||||
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
|
||||
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
CHECK(rec_count);
|
||||
auto recs = std::vector<rocprofiler_record_counter_t>{};
|
||||
auto status = rocprofiler::counters::read_agent_ctx(
|
||||
rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs);
|
||||
if(status == ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
if(recs.size() > *rec_count)
|
||||
{
|
||||
*rec_count = recs.size();
|
||||
return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
*rec_count = recs.size();
|
||||
std::memcpy(
|
||||
output_records, recs.data(), sizeof(rocprofiler_record_counter_t) * recs.size());
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
return rocprofiler::counters::read_agent_ctx(
|
||||
rocprofiler::context::get_registered_context(context_id), user_data, flags);
|
||||
rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user