rocprofiler_kernel_dispatch_info_t + header record for buffered counter collection (#758)

* Update include/rocprofiler-sdk

- defines.h
  - ROCPROFILER_VERSION_10_0 -> ROCPROFILER_SDK_VERSION_0_0
- fwd.h
  - rocprofiler_counter_record_kind_t
  - rocprofiler_kernel_dispatch_info_t
  - rocprofiler_record_counter_t
    - has dispatch id instead of correlation id
  - rocprofiler_counter_info_v0_t
    - added rocprofiler_counter_id_t field
    - added is_constant field
    - reordered better packing
- dispatch_profile.h
  - added rocprofiler_profile_counting_dispatch_record_t for use as a header record for rocprofiler_profile_counting_dispatch_data_t
- callback_tracing.h
  - rocprofiler_callback_tracing_kernel_dispatch_data_t uses rocprofiler_kernel_dispatch_info_t
- buffer_tracing.h
  - rocprofiler_buffer_tracing_kernel_dispatch_record_t uses rocprofiler_kernel_dispatch_info_t

* Update lib/rocprofiler-sdk/*

- transition to rocprofiler_kernel_dispatch_info_t
- set id and is_constant values for rocprofiler_counter_info_v0_t in rocprofiler_query_counter_info

* Update lib/rocprofiler-sdk-tool

- transition to rocprofiler_kernel_dispatch_info_t

* Update lib/rocprofiler-sdk/counters/tests/core.cpp

- transition to rocprofiler_kernel_dispatch_info_t

* Update samples

- transition to rocprofiler_kernel_dispatch_info_t
- transition to rocprofiler_counter_record_kind_t

* Update tests

- transition to rocprofiler_kernel_dispatch_info_t
- transition to rocprofiler_counter_record_kind_t
- improve integration test validation for counter-collection
- update serialization for new/additional types

* Fix tests/counter-collection/validate.py

- loosen restrictions on the length of counter description

* Update include/rocprofiler-sdk/buffer_tracing.h

- remove accidental packed attribute

* Update lib/rocprofiler-sdk/counters/xml/derived_counters.xml

- Add description for TCC_TAG_STALL_sum (reference: https://rocm.docs.amd.com/en/develop/conceptual/gpu-arch/mi300-mi200-performance-counters.html)

* Update tests/page-migration/validate.py

[ROCm/rocprofiler-sdk commit: 07537b6231]
Šī revīzija ir iekļauta:
Jonathan R. Madsen
2024-04-12 17:30:34 -05:00
revīziju iesūtīja GitHub
vecāks 4f99edbad5
revīzija 2aef3c3d15
23 mainīti faili ar 489 papildinājumiem un 273 dzēšanām
@@ -60,6 +60,7 @@
#include <iostream>
#include <map>
#include <mutex>
#include <sstream>
#include <string>
#include <string_view>
#include <thread>
@@ -222,19 +223,22 @@ tool_tracing_callback(rocprofiler_context_id_t context,
auto info = std::stringstream{};
info << "agent_id=" << record->agent_id.handle
<< ", queue_id=" << record->queue_id.handle << ", kernel_id=" << record->kernel_id
<< ", kernel=" << client_kernels.at(record->kernel_id).kernel_name
info << "agent_id=" << record->dispatch_info.agent_id.handle
<< ", queue_id=" << record->dispatch_info.queue_id.handle
<< ", kernel_id=" << record->dispatch_info.kernel_id
<< ", kernel=" << client_kernels.at(record->dispatch_info.kernel_id).kernel_name
<< ", context=" << context.handle << ", buffer_id=" << buffer_id.handle
<< ", cid=" << record->correlation_id.internal
<< ", extern_cid=" << record->correlation_id.external.value
<< ", kind=" << record->kind << ", start=" << record->start_timestamp
<< ", stop=" << record->end_timestamp
<< ", private_segment_size=" << record->private_segment_size
<< ", group_segment_size=" << record->group_segment_size << ", workgroup_size=("
<< record->workgroup_size.x << "," << record->workgroup_size.y << ","
<< record->workgroup_size.z << "), grid_size=(" << record->grid_size.x << ","
<< record->grid_size.y << "," << record->grid_size.z << ")";
<< ", private_segment_size=" << record->dispatch_info.private_segment_size
<< ", group_segment_size=" << record->dispatch_info.group_segment_size
<< ", workgroup_size=(" << record->dispatch_info.workgroup_size.x << ","
<< record->dispatch_info.workgroup_size.y << ","
<< record->dispatch_info.workgroup_size.z << "), grid_size=("
<< record->dispatch_info.grid_size.x << "," << record->dispatch_info.grid_size.y
<< "," << record->dispatch_info.grid_size.z << ")";
if(record->start_timestamp > record->end_timestamp)
throw std::runtime_error("kernel dispatch: start > end");
@@ -267,7 +271,10 @@ tool_tracing_callback(rocprofiler_context_id_t context,
}
else
{
throw std::runtime_error{"unexpected rocprofiler_record_header_t category + kind"};
auto _msg = std::stringstream{};
_msg << "unexpected rocprofiler_record_header_t category + kind: (" << header->category
<< " + " << header->kind << ")";
throw std::runtime_error{_msg.str()};
}
}
}
@@ -74,12 +74,13 @@ record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
void* callback_data_args)
{
std::stringstream ss;
ss << "Kernel_id " << dispatch_data.kernel_id << ": ";
ss << "Dispatch_Id=" << dispatch_data.dispatch_info.dispatch_id
<< ", Kernel_id=" << dispatch_data.dispatch_info.kernel_id
<< ", Corr_Id=" << dispatch_data.correlation_id.internal << ": ";
for(size_t i = 0; i < record_count; ++i)
{
ss << "(Id: " << record_data[i].id << " Value [D]: " << record_data[i].counter_value
<< " Corr_Id: " << record_data[i].correlation_id.internal << "),";
}
<< "),";
auto* output_stream = static_cast<std::ostream*>(callback_data_args);
if(!output_stream) throw std::runtime_error{"nullptr to output stream"};
*output_stream << "[" << __FUNCTION__ << "] " << ss.str() << "\n";
@@ -110,7 +111,8 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
auto search_cache = [&]() {
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
if(auto pos = profile_cache.find(dispatch_data.dispatch_info.agent_id.handle);
pos != profile_cache.end())
{
*config = pos->second;
return true;
@@ -133,7 +135,7 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
// Iterate through the agents and get the counters available on that agent
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
dispatch_data.agent_id,
dispatch_data.dispatch_info.agent_id,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -167,12 +169,13 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
// Create a colleciton profile for the counters
rocprofiler_profile_config_id_t profile;
ROCPROFILER_CALL(
rocprofiler_create_profile_config(
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(rocprofiler_create_profile_config(dispatch_data.dispatch_info.agent_id,
collect_counters.data(),
collect_counters.size(),
&profile),
"Could not construct profile cfg");
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
profile_cache.emplace(dispatch_data.dispatch_info.agent_id.handle, profile);
// Return the profile to collect those counters for this dispatch
*config = profile;
}
@@ -95,12 +95,23 @@ buffered_callback(rocprofiler_context_id_t,
for(size_t i = 0; i < num_headers; ++i)
{
auto* header = headers[i];
if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && header->kind == 0)
if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS &&
header->kind == ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADER)
{
// Print the returned counter data.
auto* record =
static_cast<rocprofiler_profile_counting_dispatch_record_t*>(header->payload);
ss << "[Dispatch_Id: " << record->dispatch_info.dispatch_id
<< " Kernel_ID: " << record->dispatch_info.kernel_id
<< " Corr_Id: " << record->correlation_id.internal << ")]\n";
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS &&
header->kind == ROCPROFILER_COUNTER_RECORD_VALUE)
{
// Print the returned counter data.
auto* record = static_cast<rocprofiler_record_counter_t*>(header->payload);
ss << "(Id: " << record->id << " Value [D]: " << record->counter_value
<< " Corr_Id: " << record->correlation_id.internal << "),";
ss << " (Dispatch_Id: " << record->dispatch_id << " Id: " << record->id
<< " Value [D]: " << record->counter_value << "),";
}
}
@@ -133,7 +144,8 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
auto search_cache = [&]() {
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
if(auto pos = profile_cache.find(dispatch_data.dispatch_info.agent_id.handle);
pos != profile_cache.end())
{
*config = pos->second;
return true;
@@ -156,7 +168,7 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
// Iterate through the agents and get the counters available on that agent
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
dispatch_data.agent_id,
dispatch_data.dispatch_info.agent_id,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -190,12 +202,13 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
// Create a colleciton profile for the counters
rocprofiler_profile_config_id_t profile;
ROCPROFILER_CALL(
rocprofiler_create_profile_config(
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(rocprofiler_create_profile_config(dispatch_data.dispatch_info.agent_id,
collect_counters.data(),
collect_counters.size(),
&profile),
"Could not construct profile cfg");
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
profile_cache.emplace(dispatch_data.dispatch_info.agent_id.handle, profile);
// Return the profile to collect those counters for this dispatch
*config = profile;
}
@@ -170,7 +170,8 @@ buffered_callback(rocprofiler_context_id_t,
for(size_t i = 0; i < num_headers; ++i)
{
auto* header = headers[i];
if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && header->kind == 0)
if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS &&
header->kind == ROCPROFILER_COUNTER_RECORD_VALUE)
{
// Record the counters we have in the buffer and the number of instances of
// the counter we have seen.
@@ -242,7 +243,7 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
{
std::vector<rocprofiler_counter_id_t> counters_needed;
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
dispatch_data.agent_id,
dispatch_data.dispatch_info.agent_id,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -269,7 +270,7 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
cap.expected_counter_names.emplace(found_counter.handle, std::string(version.name));
size_t expected = 0;
ROCPROFILER_CALL(rocprofiler_query_counter_instance_count(
dispatch_data.agent_id, found_counter, &expected),
dispatch_data.dispatch_info.agent_id, found_counter, &expected),
"COULD NOT QUERY INSTANCES");
cap.remaining.push_back(found_counter);
cap.expected.emplace(found_counter.handle, expected);
@@ -297,8 +298,9 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
}
if(cap.expected.empty())
{
std::clog << "No counters found for agent " << dispatch_data.agent_id.handle << " ("
<< agents.at(dispatch_data.agent_id.handle)->name << ")";
std::clog << "No counters found for agent "
<< dispatch_data.dispatch_info.agent_id.handle << " ("
<< agents.at(dispatch_data.dispatch_info.agent_id.handle)->name << ")";
}
}
if(cap.remaining.empty()) return;
@@ -306,9 +308,10 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t profile;
// Select the next counter to collect.
ROCPROFILER_CALL(rocprofiler_create_profile_config(
dispatch_data.agent_id, &(cap.remaining.back()), 1, &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(
rocprofiler_create_profile_config(
dispatch_data.dispatch_info.agent_id, &(cap.remaining.back()), 1, &profile),
"Could not construct profile cfg");
cap.remaining.pop_back();
*config = profile;
@@ -99,8 +99,9 @@ typedef struct
/// ::ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, or
/// ::ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API
/// @var operation
/// @brief ::rocprofiler_hsa_core_api_id_t, ::rocprofiler_hsa_amd_ext_api_id_t,
/// ::rocprofiler_hsa_image_ext_api_id_t, or ::rocprofiler_hsa_finalize_ext_api_id_t
/// @brief Specification of the API function, e.g., ::rocprofiler_hsa_core_api_id_t,
/// ::rocprofiler_hsa_amd_ext_api_id_t, ::rocprofiler_hsa_image_ext_api_id_t, or
/// ::rocprofiler_hsa_finalize_ext_api_id_t
} rocprofiler_buffer_tracing_hsa_api_record_t;
/**
@@ -120,7 +121,8 @@ typedef struct
/// @brief ::ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API or
/// ::ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API
/// @var operation
/// @brief ::rocprofiler_hip_runtime_api_id_t or ::rocprofiler_hip_compiler_api_id_t
/// @brief Specification of the API function, e.g., ::rocprofiler_hip_runtime_api_id_t or
/// ::rocprofiler_hip_compiler_api_id_t
} rocprofiler_buffer_tracing_hip_api_record_t;
/**
@@ -140,7 +142,8 @@ typedef struct
/// @brief ::ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
/// ::ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, or
/// ::ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API
/// @brief ::rocprofiler_marker_core_api_id_t, ::rocprofiler_marker_control_api_id_t, or
/// @brief Specification of the API function, e.g., ::rocprofiler_marker_core_api_id_t,
/// ::rocprofiler_marker_control_api_id_t, or
/// ::rocprofiler_marker_name_api_id_t
} rocprofiler_buffer_tracing_marker_api_record_t;
@@ -161,7 +164,8 @@ typedef struct
/// @var kind
/// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_COPY
/// @var operation
/// @brief memory copy direction (::rocprofiler_memory_copy_operation_t)
/// @brief Specification of the memory copy direction (@see
/// ::rocprofiler_memory_copy_operation_t)
} rocprofiler_buffer_tracing_memory_copy_record_t;
/**
@@ -171,28 +175,17 @@ typedef struct rocprofiler_buffer_tracing_kernel_dispatch_record_t
{
uint64_t size; ///< size of this struct
rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH
rocprofiler_kernel_dispatch_operation_t operation; ///<
rocprofiler_kernel_dispatch_operation_t operation;
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
rocprofiler_thread_id_t thread_id; ///< id for thread that launched kernel
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_agent_id_t agent_id; ///< agent kernel was dispatched on
rocprofiler_queue_id_t queue_id; ///< queue kernel was dispatched on
rocprofiler_kernel_id_t kernel_id; ///< identifier for kernel
rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch
uint32_t private_segment_size;
uint32_t group_segment_size;
rocprofiler_dim3_t workgroup_size;
rocprofiler_dim3_t grid_size;
rocprofiler_kernel_dispatch_info_t dispatch_info; ///< Dispatch info
/// @var private_segment_size
/// @brief runtime private memory segment size
/// @var group_segment_size
/// @brief runtime group memory segment size
/// @var workgroup_size
/// @brief runtime workgroup size (grid * threads)
/// @var grid_size
/// @brief runtime grid size
/// @var operation
/// @brief Kernel dispatch buffer records only report the ::ROCPROFILER_KERNEL_DISPATCH_COMPLETE
/// operation because there are no "real" wrapper around the enqueuing of an individual kernel
/// dispatch
} rocprofiler_buffer_tracing_kernel_dispatch_record_t;
typedef struct
@@ -232,7 +225,7 @@ typedef struct
/**
* @brief ROCProfiler Buffer Page Migration Tracer Record
*/
typedef struct
typedef struct rocprofiler_buffer_tracing_page_migration_record_t
{
uint64_t size; ///< size of this struct
rocprofiler_buffer_tracing_kind_t kind; ///< ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION
@@ -259,17 +252,16 @@ typedef struct
*/
typedef struct
{
uint64_t size; ///< size of this struct
rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY
rocprofiler_scratch_memory_operation_t
operation; ///< @see rocprofiler_scratch_memory_operation_t
rocprofiler_agent_id_t agent_id; ///< agent kernel was dispatched on
rocprofiler_queue_id_t queue_id; ///< queue kernel was dispatched on
rocprofiler_thread_id_t thread_id; ///< id for thread generating this record
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
rocprofiler_scratch_alloc_flag_t flags;
uint64_t size; ///< size of this struct
rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY
rocprofiler_scratch_memory_operation_t operation; ///< specification of the kind
rocprofiler_agent_id_t agent_id; ///< agent kernel was dispatched on
rocprofiler_queue_id_t queue_id; ///< queue kernel was dispatched on
rocprofiler_thread_id_t thread_id; ///< id for thread generating this record
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
rocprofiler_scratch_alloc_flag_t flags;
} rocprofiler_buffer_tracing_scratch_memory_record_t;
/**
@@ -286,7 +278,9 @@ typedef struct
/// @brief ::ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT
/// @var timestamp
/// @brief Timestamp (in nanosec) of when rocprofiler detected the correlation ID could be
/// retired
/// retired. Due to clock skew between the CPU and GPU, this may at times, *appear* to be before
/// the kernel or memory copy completed but the reality is that if this ever occurred, the API
/// would report a FATAL error
/// @var internal_correlation_id
/// @brief Only internal correlation ID is provided
} rocprofiler_buffer_tracing_correlation_id_retirement_record_t;
@@ -162,26 +162,10 @@ typedef struct
*/
typedef struct rocprofiler_callback_tracing_kernel_dispatch_data_t
{
uint64_t size; ///< size of this struct
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_agent_id_t agent_id; ///< agent kernel was dispatched on
rocprofiler_queue_id_t queue_id; ///< queue kernel was dispatched on
rocprofiler_kernel_id_t kernel_id; ///< identifier for kernel
rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch
uint32_t private_segment_size;
uint32_t group_segment_size;
rocprofiler_dim3_t workgroup_size;
rocprofiler_dim3_t grid_size;
/// @var private_segment_size
/// @brief runtime private memory segment size
/// @var group_segment_size
/// @brief runtime group memory segment size
/// @var workgroup_size
/// @brief runtime workgroup size (grid * threads)
/// @var grid_size
/// @brief runtime grid size
uint64_t size; ///< size of this struct
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_kernel_dispatch_info_t dispatch_info; ///< Dispatch info
} rocprofiler_callback_tracing_kernel_dispatch_data_t;
/**
@@ -39,10 +39,10 @@
*/
/**
* @brief The function was introduced in version 10.0 of the interface and has the
* symbol version string of ``"ROCPROFILER_10.0"``.
* @brief The function was introduced in version 0.0 of the interface and has the
* symbol version string of ``"ROCPROFILER_SDK_0.0"``.
*/
#define ROCPROFILER_VERSION_10_0
#define ROCPROFILER_SDK_VERSION_0_0
/** @} */
@@ -123,7 +123,9 @@
#ifdef __cplusplus
# define ROCPROFILER_EXTERN_C_INIT extern "C" {
# define ROCPROFILER_EXTERN_C_FINI }
# define ROCPROFILER_CXX_CODE(...) __VA_ARGS__
#else
# define ROCPROFILER_EXTERN_C_INIT
# define ROCPROFILER_EXTERN_C_FINI
# define ROCPROFILER_CXX_CODE(...)
#endif
@@ -38,23 +38,29 @@ ROCPROFILER_EXTERN_C_INIT
*/
/**
* @brief Kernel dispatch data for profile counting callbacks
* @brief Kernel dispatch data for profile counting callbacks.
*
*/
typedef struct rocprofiler_profile_counting_dispatch_data_t
{
uint64_t size; ///< Size of this struct
rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched
rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued
rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier
rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch
rocprofiler_correlation_id_t correlation_id; ///< Correlation ID for this dispatch
uint32_t private_segment_size; ///< runtime private memory segment size
uint32_t group_segment_size; ///< runtime group memory segment size
rocprofiler_dim3_t workgroup_size; ///< runtime workgroup size (grid * threads)
rocprofiler_dim3_t grid_size; ///< runtime grid size
uint64_t size; ///< Size of this struct
rocprofiler_correlation_id_t correlation_id; ///< Correlation ID for this dispatch
rocprofiler_kernel_dispatch_info_t dispatch_info; ///< Dispatch info
} rocprofiler_profile_counting_dispatch_data_t;
/**
* @brief ROCProfiler Profile Counting Counter Record Header Information
*
* This is buffer equivalent of ::rocprofiler_profile_counting_dispatch_data_t
*/
typedef struct rocprofiler_profile_counting_dispatch_record_t
{
uint64_t size; ///< Size of this struct
uint64_t num_records; ///< number of ::rocprofiler_record_counter_t records
rocprofiler_correlation_id_t correlation_id; ///< Correlation ID for this dispatch
rocprofiler_kernel_dispatch_info_t dispatch_info; ///< Contains the `dispatch_id`
} rocprofiler_profile_counting_dispatch_record_t;
/**
* @brief Kernel Dispatch Callback. This is a callback that is invoked before the kernel
* is enqueued into the HSA queue. What counters to collect for a kernel are set
@@ -364,6 +364,22 @@ typedef enum
ROCPROFILER_COUNTER_INFO_VERSION_LAST,
} rocprofiler_counter_info_version_id_t;
/**
* @brief Enumeration for distinguishing different buffer record kinds within the
* ::ROCPROFILER_BUFFER_CATEGORY_COUNTERS category
*/
typedef enum
{
ROCPROFILER_COUNTER_RECORD_NONE = 0,
ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADER, ///< ::rocprofiler_profile_counting_dispatch_record_t
ROCPROFILER_COUNTER_RECORD_VALUE,
ROCPROFILER_COUNTER_RECORD_LAST,
/// @var ROCPROFILER_COUNTER_RECORD_KIND_DISPATCH_PROFILE_HEADER
/// @brief Indicates the payload type is of type
/// ::rocprofiler_profile_counting_dispatch_record_t
} rocprofiler_counter_record_kind_t;
//--------------------------------------------------------------------------------------//
//
// ALIASES
@@ -591,6 +607,24 @@ rocprofiler_record_header_compute_hash(uint32_t category, uint32_t kind)
return value;
}
/**
* @brief ROCProfiler kernel dispatch information
*
*/
typedef struct rocprofiler_kernel_dispatch_info_t
{
uint64_t size; ///< Size of this struct
rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched
rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued
rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier
rocprofiler_dispatch_id_t dispatch_id; ///< unique id for each dispatch
uint32_t private_segment_size; ///< runtime private memory segment size
uint32_t group_segment_size; ///< runtime group memory segment size
rocprofiler_dim3_t workgroup_size; ///< runtime workgroup size (grid * threads)
rocprofiler_dim3_t grid_size; ///< runtime grid size
uint8_t reserved_padding[56]; // reserved for extensions w/o ABI break
} rocprofiler_kernel_dispatch_info_t;
/**
* @brief Details for the dimension, including its size, for a counter record.
*/
@@ -611,10 +645,17 @@ typedef struct
{
rocprofiler_counter_instance_id_t id; ///< counter identifier
double counter_value; ///< counter value
rocprofiler_correlation_id_t correlation_id;
rocprofiler_dispatch_id_t dispatch_id;
/// @var correlation_id
/// @brief Used to correlate the kernel data to an API call
/// @var dispatch_id
/// @brief A value greater than zero indicates that this counter record is associated with a
/// specific dispatch.
///
/// This value can be mapped to a dispatch via the `dispatch_info` field (@see
/// ::rocprofiler_kernel_dispatch_info_t) of a ::rocprofiler_profile_counting_dispatch_data_t
/// instance (provided during callback for profile config) or a
/// ::rocprofiler_profile_counting_dispatch_record_t records (which will be insert into the
/// buffer prior to the associated ::rocprofiler_record_counter_t records).
} rocprofiler_record_counter_t;
/**
@@ -622,11 +663,13 @@ typedef struct
*/
typedef struct
{
int is_derived; ///< If this counter is a derived counter
const char* name; ///< Name of the counter
const char* description; ///< Description of the counter
const char* block; ///< Block of the counter (non-derived only)
const char* expression; ///< Counter expression (derived counters only)
rocprofiler_counter_id_t id; ///< Id of this counter
const char* name; ///< Name of the counter
const char* description; ///< Description of the counter
const char* block; ///< Block of the counter (non-derived only)
const char* expression; ///< Counter expression (derived counters only)
uint8_t is_constant : 1; ///< If this counter is HW constant
uint8_t is_derived : 1; ///< If this counter is a derived counter
} rocprofiler_counter_info_v0_t;
/**
@@ -646,3 +689,7 @@ typedef struct
/** @} */
ROCPROFILER_EXTERN_C_FINI
ROCPROFILER_CXX_CODE(
static_assert(sizeof(rocprofiler_kernel_dispatch_info_t) == 128,
"Increasing the size of the kernel dispatch info is not permitted");)
@@ -154,25 +154,26 @@ generate_csv(tool_table* tool_functions, std::vector<kernel_dispatch_ring_buffer
auto kernel_trace_ss = std::stringstream{};
rocprofiler_buffer_tracing_kernel_dispatch_record_t* record = buf.retrieve();
if(record == nullptr) break;
auto kernel_name = tool_functions->tool_get_kernel_name_fn(record->kernel_id);
auto kernel_name =
tool_functions->tool_get_kernel_name_fn(record->dispatch_info.kernel_id);
rocprofiler::tool::csv::kernel_trace_csv_encoder::write_row(
kernel_trace_ss,
tool_functions->tool_get_domain_name_fn(record->kind),
tool_functions->tool_get_agent_node_id_fn(record->agent_id),
record->queue_id.handle,
record->kernel_id,
tool_functions->tool_get_agent_node_id_fn(record->dispatch_info.agent_id),
record->dispatch_info.queue_id.handle,
record->dispatch_info.kernel_id,
kernel_name,
record->correlation_id.internal,
record->start_timestamp,
record->end_timestamp,
record->private_segment_size,
record->group_segment_size,
record->workgroup_size.x,
record->workgroup_size.y,
record->workgroup_size.z,
record->grid_size.x,
record->grid_size.y,
record->grid_size.z);
record->dispatch_info.private_segment_size,
record->dispatch_info.group_segment_size,
record->dispatch_info.workgroup_size.x,
record->dispatch_info.workgroup_size.y,
record->dispatch_info.workgroup_size.z,
record->dispatch_info.grid_size.x,
record->dispatch_info.grid_size.y,
record->dispatch_info.grid_size.z);
if(tool::get_config().stats)
kernel_stats[kernel_name] += (record->end_timestamp - record->start_timestamp);
@@ -355,7 +356,7 @@ generate_csv(tool_table* tool_functions, std::vector<counter_collection_ring_buf
{
rocprofiler_tool_counter_collection_record_t* record = buf.retrieve();
if(record == nullptr) break;
auto kernel_id = record->dispatch_data.kernel_id;
auto kernel_id = record->dispatch_data.dispatch_info.kernel_id;
auto counter_name_value = std::map<std::string, uint64_t>{};
for(const auto& count : record->profiler_record)
{
@@ -379,13 +380,14 @@ generate_csv(tool_table* tool_functions, std::vector<counter_collection_ring_buf
counter_collection_ss,
correlation_id.internal,
record->dispatch_index,
tool_functions->tool_get_agent_node_id_fn(record->dispatch_data.agent_id),
record->dispatch_data.queue_id.handle,
tool_functions->tool_get_agent_node_id_fn(
record->dispatch_data.dispatch_info.agent_id),
record->dispatch_data.dispatch_info.queue_id.handle,
record->pid,
record->thread_id,
magnitude(record->dispatch_data.grid_size),
magnitude(record->dispatch_data.dispatch_info.grid_size),
tool_functions->tool_get_kernel_name_fn(kernel_id),
magnitude(record->dispatch_data.workgroup_size),
magnitude(record->dispatch_data.dispatch_info.workgroup_size),
record->lds_block_size_v,
record->private_segment_size,
record->arch_vgpr_count,
@@ -1246,8 +1246,8 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_user_data_t* user_data,
void* /*callback_data_args*/)
{
auto kernel_id = dispatch_data.kernel_id;
auto agent_id = dispatch_data.agent_id;
auto kernel_id = dispatch_data.dispatch_info.kernel_id;
auto agent_id = dispatch_data.dispatch_info.agent_id;
if(!is_targeted_kernel(kernel_id))
{
@@ -1283,7 +1283,7 @@ counter_record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_da
rocprofiler_user_data_t user_data,
void* /*callback_data_args*/)
{
auto kernel_id = dispatch_data.kernel_id;
auto kernel_id = dispatch_data.dispatch_info.kernel_id;
const auto* cnt_dispatch_data_v = static_cast<counter_dispatch_data*>(user_data.ptr);
rocprofiler_tool_counter_collection_record_t counter_record;
@@ -51,7 +51,7 @@ extern "C" {
* @retval ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND if counter not found
* @retval ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI Version is not supported
*/
rocprofiler_status_t ROCPROFILER_API
rocprofiler_status_t
rocprofiler_query_counter_info(rocprofiler_counter_id_t counter_id,
rocprofiler_counter_info_version_id_t version,
void* info)
@@ -64,6 +64,8 @@ rocprofiler_query_counter_info(rocprofiler_counter_id_t counter_id,
if(const auto* metric_ptr = rocprofiler::common::get_val(id_map, counter_id.handle))
{
out_struct.id = counter_id;
out_struct.is_constant = (metric_ptr->special().empty()) ? 0 : 1;
out_struct.is_derived = (metric_ptr->expression().empty()) ? 0 : 1;
out_struct.name = metric_ptr->name().c_str();
out_struct.description = metric_ptr->description().c_str();
@@ -326,19 +326,23 @@ queue_cb(const context::context* ctx,
auto dispatch_data =
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{});
dispatch_data.kernel_id = kernel_id;
dispatch_data.dispatch_id = dispatch_id;
dispatch_data.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id;
dispatch_data.queue_id = queue.get_id();
dispatch_data.correlation_id = _corr_id_v;
dispatch_data.private_segment_size = pkt.kernel_dispatch.private_segment_size;
dispatch_data.group_segment_size = pkt.kernel_dispatch.group_segment_size;
dispatch_data.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x,
pkt.kernel_dispatch.workgroup_size_y,
pkt.kernel_dispatch.workgroup_size_z};
dispatch_data.grid_size = {pkt.kernel_dispatch.grid_size_x,
pkt.kernel_dispatch.grid_size_y,
pkt.kernel_dispatch.grid_size_z};
dispatch_data.correlation_id = _corr_id_v;
{
auto dispatch_info = common::init_public_api_struct(rocprofiler_kernel_dispatch_info_t{});
dispatch_info.kernel_id = kernel_id;
dispatch_info.dispatch_id = dispatch_id;
dispatch_info.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id;
dispatch_info.queue_id = queue.get_id();
dispatch_info.private_segment_size = pkt.kernel_dispatch.private_segment_size;
dispatch_info.group_segment_size = pkt.kernel_dispatch.group_segment_size;
dispatch_info.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x,
pkt.kernel_dispatch.workgroup_size_y,
pkt.kernel_dispatch.workgroup_size_z};
dispatch_info.grid_size = {pkt.kernel_dispatch.grid_size_x,
pkt.kernel_dispatch.grid_size_y,
pkt.kernel_dispatch.grid_size_z};
dispatch_data.dispatch_info = dispatch_info;
}
info->user_cb(dispatch_data, &req_profile, user_data, info->callback_args);
@@ -441,6 +445,7 @@ completed_cb(const context::context* ctx,
}
}
auto _dispatch_id = session.callback_record.dispatch_info.dispatch_id;
for(auto& ast : prof_config->asts)
{
std::vector<std::unique_ptr<std::vector<rocprofiler_record_counter_t>>> cache;
@@ -448,35 +453,47 @@ completed_cb(const context::context* ctx,
CHECK(ret);
ast.set_out_id(*ret);
out.reserve(out.size() + ret->size());
for(auto& val : *ret)
{
val.correlation_id = _corr_id_v;
if(buf)
buf->emplace(ROCPROFILER_BUFFER_CATEGORY_COUNTERS, 0, val);
else
out.push_back(val);
val.dispatch_id = _dispatch_id;
out.emplace_back(val);
}
}
if(!out.empty())
{
CHECK(info->record_callback);
if(buf)
{
auto _header =
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_record_t{});
_header.num_records = out.size();
_header.correlation_id = _corr_id_v;
_header.dispatch_info = session.callback_record.dispatch_info;
buf->emplace(ROCPROFILER_BUFFER_CATEGORY_COUNTERS,
ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADER,
_header);
auto dispatch_data =
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{});
for(auto itr : out)
buf->emplace(
ROCPROFILER_BUFFER_CATEGORY_COUNTERS, ROCPROFILER_COUNTER_RECORD_VALUE, itr);
}
else
{
CHECK(info->record_callback);
dispatch_data.kernel_id = session.callback_record.kernel_id;
dispatch_data.dispatch_id = session.callback_record.dispatch_id;
dispatch_data.agent_id = session.callback_record.agent_id;
dispatch_data.queue_id = session.callback_record.queue_id;
dispatch_data.correlation_id = _corr_id_v;
dispatch_data.private_segment_size = session.callback_record.private_segment_size;
dispatch_data.group_segment_size = session.callback_record.group_segment_size;
dispatch_data.workgroup_size = session.callback_record.workgroup_size;
dispatch_data.grid_size = session.callback_record.grid_size;
auto dispatch_data =
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{});
info->record_callback(
dispatch_data, out.data(), out.size(), session.user_data, info->record_callback_args);
dispatch_data.dispatch_info = session.callback_record.dispatch_info;
dispatch_data.correlation_id = _corr_id_v;
info->record_callback(dispatch_data,
out.data(),
out.size(),
session.user_data,
info->record_callback_args);
}
}
}
@@ -180,7 +180,8 @@ buffered_callback(rocprofiler_context_id_t,
for(size_t i = 0; i < num_headers; ++i)
{
auto* header = headers[i];
if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && header->kind == 0)
if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS &&
header->kind == ROCPROFILER_COUNTER_RECORD_VALUE)
{
// Print the returned counter data.
auto* record = static_cast<rocprofiler_record_counter_t*>(header->payload);
@@ -380,11 +381,11 @@ user_dispatch_cb(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
{
expected_dispatch& expected = *static_cast<expected_dispatch*>(callback_data_args);
auto agent_id = dispatch_data.agent_id;
auto queue_id = dispatch_data.queue_id;
auto agent_id = dispatch_data.dispatch_info.agent_id;
auto queue_id = dispatch_data.dispatch_info.queue_id;
auto correlation_id = dispatch_data.correlation_id;
auto kernel_id = dispatch_data.kernel_id;
auto dispatch_id = dispatch_data.dispatch_id;
auto kernel_id = dispatch_data.dispatch_info.kernel_id;
auto dispatch_id = dispatch_data.dispatch_info.dispatch_id;
EXPECT_EQ(sizeof(rocprofiler_profile_counting_dispatch_data_t), dispatch_data.size);
EXPECT_EQ(expected.kernel_id, kernel_id);
@@ -394,8 +395,8 @@ user_dispatch_cb(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
EXPECT_EQ(expected.correlation_id.internal, correlation_id.internal);
EXPECT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr);
EXPECT_EQ(expected.correlation_id.external.value, correlation_id.external.value);
EXPECT_EQ(expected.workgroup_size, dispatch_data.workgroup_size);
EXPECT_EQ(expected.grid_size, dispatch_data.grid_size);
EXPECT_EQ(expected.workgroup_size, dispatch_data.dispatch_info.workgroup_size);
EXPECT_EQ(expected.grid_size, dispatch_data.dispatch_info.grid_size);
ASSERT_NE(config, nullptr);
config->handle = expected.id.handle;
@@ -210,7 +210,7 @@
<metric name="TCC_EA_RDREQ_IO_CREDIT_STALL_sum" expr=reduce(TCC_EA_RDREQ_IO_CREDIT_STALL,sum) descr="Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
<metric name="TCC_EA_RDREQ_GMI_CREDIT_STALL_sum" expr=reduce(TCC_EA_RDREQ_GMI_CREDIT_STALL,sum) descr="Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
<metric name="TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum" expr=reduce(TCC_EA_RDREQ_DRAM_CREDIT_STALL,sum) descr="Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
<metric name="TCC_TAG_STALL_sum" expr=reduce(TCC_TAG_STALL,sum) descr="."></metric>
<metric name="TCC_TAG_STALL_sum" expr=reduce(TCC_TAG_STALL,sum) descr="Total number of cycles the normal request pipeline in the tag is stalled for any reason."></metric>
<metric name="TCC_NORMAL_WRITEBACK_sum" expr=reduce(TCC_NORMAL_WRITEBACK,sum) descr="Number of writebacks due to requests that are not writeback requests. Sum over TCC instances."></metric>
<metric name="TCC_ALL_TC_OP_WB_WRITEBACK_sum" expr=reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum) descr="Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances."></metric>
<metric name="TCC_NORMAL_EVICT_sum" expr=reduce(TCC_NORMAL_EVICT,sum) descr="Number of evictions due to requests that are not invalidate or probe requests. Sum over TCC instances."></metric>
@@ -410,7 +410,7 @@
<metric name="TCC_EA0_RDREQ_IO_CREDIT_STALL_sum" expr=reduce(TCC_EA0_RDREQ_IO_CREDIT_STALL,sum) descr="Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
<metric name="TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum" expr=reduce(TCC_EA0_RDREQ_GMI_CREDIT_STALL,sum) descr="Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
<metric name="TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum" expr=reduce(TCC_EA0_RDREQ_DRAM_CREDIT_STALL,sum) descr="Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not. Sum over TCC instances."></metric>
<metric name="TCC_TAG_STALL_sum" expr=reduce(TCC_TAG_STALL,sum) descr="."></metric>
<metric name="TCC_TAG_STALL_sum" expr=reduce(TCC_TAG_STALL,sum) descr="Total number of cycles the normal request pipeline in the tag is stalled for any reason."></metric>
<metric name="TCC_NORMAL_WRITEBACK_sum" expr=reduce(TCC_NORMAL_WRITEBACK,sum) descr="Number of writebacks due to requests that are not writeback requests. Sum over TCC instances."></metric>
<metric name="TCC_ALL_TC_OP_WB_WRITEBACK_sum" expr=reduce(TCC_ALL_TC_OP_WB_WRITEBACK,sum) descr="Number of writebacks due to all TC_OP writeback requests. Sum over TCC instances."></metric>
<metric name="TCC_NORMAL_EVICT_sum" expr=reduce(TCC_NORMAL_EVICT,sum) descr="Number of evictions due to requests that are not invalidate or probe requests. Sum over TCC instances."></metric>
@@ -275,23 +275,32 @@ WriteInterceptor(const void* packets,
corr_id->add_kern_count();
}
auto dispatch_id = ++sequence_counter;
auto callback_record =
callback_record_t{sizeof(callback_record_t),
rocprofiler_timestamp_t{0},
rocprofiler_timestamp_t{0},
queue.get_agent().get_rocp_agent()->id,
queue.get_id(),
kernel_id,
dispatch_id,
kernel_pkt.kernel_dispatch.private_segment_size,
kernel_pkt.kernel_dispatch.group_segment_size,
rocprofiler_dim3_t{kernel_pkt.kernel_dispatch.workgroup_size_x,
kernel_pkt.kernel_dispatch.workgroup_size_y,
kernel_pkt.kernel_dispatch.workgroup_size_z},
rocprofiler_dim3_t{kernel_pkt.kernel_dispatch.grid_size_x,
kernel_pkt.kernel_dispatch.grid_size_y,
kernel_pkt.kernel_dispatch.grid_size_z}};
// computes the "size" based on the offset of reserved_padding field
constexpr auto kernel_dispatch_info_rt_size =
common::compute_runtime_sizeof<rocprofiler_kernel_dispatch_info_t>();
static_assert(kernel_dispatch_info_rt_size < sizeof(rocprofiler_kernel_dispatch_info_t),
"failed to compute size field based on offset of reserved_padding field");
auto dispatch_id = ++sequence_counter;
auto callback_record = callback_record_t{
sizeof(callback_record_t),
rocprofiler_timestamp_t{0},
rocprofiler_timestamp_t{0},
rocprofiler_kernel_dispatch_info_t{
.size = kernel_dispatch_info_rt_size,
.agent_id = queue.get_agent().get_rocp_agent()->id,
.queue_id = queue.get_id(),
.kernel_id = kernel_id,
.dispatch_id = dispatch_id,
.private_segment_size = kernel_pkt.kernel_dispatch.private_segment_size,
.group_segment_size = kernel_pkt.kernel_dispatch.group_segment_size,
.workgroup_size = rocprofiler_dim3_t{kernel_pkt.kernel_dispatch.workgroup_size_x,
kernel_pkt.kernel_dispatch.workgroup_size_y,
kernel_pkt.kernel_dispatch.workgroup_size_z},
.grid_size = rocprofiler_dim3_t{kernel_pkt.kernel_dispatch.grid_size_x,
kernel_pkt.kernel_dispatch.grid_size_y,
kernel_pkt.kernel_dispatch.grid_size_z}}};
{
auto tracer_data = callback_record;
@@ -66,9 +66,9 @@ dispatch_complete(queue_info_session_t& session)
// only do the following work if there are contexts that require this info
auto& callback_record = session.callback_record;
const auto& _extern_corr_ids = session.tracing_data.external_correlation_ids;
const auto* _rocp_agent = agent::get_agent(callback_record.agent_id);
const auto* _rocp_agent = agent::get_agent(callback_record.dispatch_info.agent_id);
auto _hsa_agent = agent::get_hsa_agent(_rocp_agent);
auto _kern_id = callback_record.kernel_id;
auto _kern_id = callback_record.dispatch_info.kernel_id;
auto _signal = session.kernel_pkt.kernel_dispatch.completion_signal;
auto _tid = session.tid;
@@ -122,14 +122,7 @@ dispatch_complete(queue_info_session_t& session)
_tid,
callback_record.start_timestamp,
callback_record.end_timestamp,
callback_record.agent_id,
callback_record.queue_id,
callback_record.kernel_id,
callback_record.dispatch_id,
callback_record.private_segment_size,
callback_record.group_segment_size,
callback_record.workgroup_size,
callback_record.grid_size};
callback_record.dispatch_info};
tracing::execute_buffer_record_emplace(tracing_data_v.buffered_contexts,
_tid,
@@ -194,10 +194,10 @@ def test_kernel_ids(input_data):
assert payload["kernel_name"] == symbol_info[kern_id]["kernel_name"]
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
assert itr["kernel_id"] in symbol_info.keys()
assert itr["dispatch_info"]["kernel_id"] in symbol_info.keys()
for itr in sdk_data["callback_records"]["kernel_dispatches"]:
assert itr["payload"]["kernel_id"] in symbol_info.keys()
assert itr["payload"]["dispatch_info"]["kernel_id"] in symbol_info.keys()
def test_kernel_dispatch_ids(input_data):
@@ -211,11 +211,11 @@ def test_kernel_dispatch_ids(input_data):
bf_seq_ids = []
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
bf_seq_ids.append(itr["dispatch_id"])
bf_seq_ids.append(itr["dispatch_info"]["dispatch_id"])
cb_seq_ids = []
for itr in sdk_data["callback_records"]["kernel_dispatches"]:
cb_seq_ids.append(itr["payload"]["dispatch_id"])
cb_seq_ids.append(itr["payload"]["dispatch_info"]["dispatch_id"])
bf_seq_ids = sorted(bf_seq_ids)
cb_seq_ids = sorted(cb_seq_ids)
@@ -97,6 +97,13 @@ save(ArchiveT& ar, rocprofiler_queue_id_t data)
SAVE_DATA_FIELD(handle);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_counter_id_t data)
{
SAVE_DATA_FIELD(handle);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_correlation_id_t data)
@@ -270,11 +277,9 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_scratch_memory_data_t data)
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_callback_tracing_kernel_dispatch_data_t data)
save(ArchiveT& ar, rocprofiler_kernel_dispatch_info_t data)
{
SAVE_DATA_FIELD(size);
SAVE_DATA_FIELD(start_timestamp);
SAVE_DATA_FIELD(end_timestamp);
SAVE_DATA_FIELD(agent_id);
SAVE_DATA_FIELD(queue_id);
SAVE_DATA_FIELD(kernel_id);
@@ -285,20 +290,33 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_kernel_dispatch_data_t data)
SAVE_DATA_FIELD(group_segment_size);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_callback_tracing_kernel_dispatch_data_t data)
{
SAVE_DATA_FIELD(size);
SAVE_DATA_FIELD(start_timestamp);
SAVE_DATA_FIELD(end_timestamp);
SAVE_DATA_FIELD(dispatch_info);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_profile_counting_dispatch_data_t data)
{
SAVE_DATA_FIELD(size);
SAVE_DATA_FIELD(agent_id);
SAVE_DATA_FIELD(queue_id);
SAVE_DATA_FIELD(kernel_id);
SAVE_DATA_FIELD(dispatch_id);
SAVE_DATA_FIELD(correlation_id);
SAVE_DATA_FIELD(private_segment_size);
SAVE_DATA_FIELD(group_segment_size);
SAVE_DATA_FIELD(workgroup_size);
SAVE_DATA_FIELD(grid_size);
SAVE_DATA_FIELD(dispatch_info);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_profile_counting_dispatch_record_t data)
{
SAVE_DATA_FIELD(size);
SAVE_DATA_FIELD(num_records);
SAVE_DATA_FIELD(correlation_id);
SAVE_DATA_FIELD(dispatch_info);
}
template <typename ArchiveT>
@@ -339,7 +357,7 @@ save(ArchiveT& ar, rocprofiler_record_counter_t data)
{
SAVE_DATA_FIELD(id);
SAVE_DATA_FIELD(counter_value);
SAVE_DATA_FIELD(correlation_id);
SAVE_DATA_FIELD(dispatch_id);
}
template <typename ArchiveT>
@@ -367,14 +385,7 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_kernel_dispatch_record_t data)
SAVE_DATA_FIELD(correlation_id);
SAVE_DATA_FIELD(start_timestamp);
SAVE_DATA_FIELD(end_timestamp);
SAVE_DATA_FIELD(agent_id);
SAVE_DATA_FIELD(queue_id);
SAVE_DATA_FIELD(kernel_id);
SAVE_DATA_FIELD(dispatch_id);
SAVE_DATA_FIELD(private_segment_size);
SAVE_DATA_FIELD(group_segment_size);
SAVE_DATA_FIELD(workgroup_size);
SAVE_DATA_FIELD(grid_size);
SAVE_DATA_FIELD(dispatch_info);
}
template <typename ArchiveT>
@@ -705,6 +716,19 @@ save(ArchiveT& ar, const rocprofiler_agent_t& data)
generate("caches", data.caches, data.caches_count);
generate("io_links", data.io_links, data.io_links_count);
}
template <typename ArchiveT>
void
save(ArchiveT& ar, rocprofiler_counter_info_v0_t data)
{
SAVE_DATA_FIELD(id);
SAVE_DATA_BITFIELD("is_constant", is_constant);
SAVE_DATA_BITFIELD("is_derived", is_derived);
SAVE_DATA_CSTR(name);
SAVE_DATA_CSTR(description);
SAVE_DATA_CSTR(block);
SAVE_DATA_CSTR(expression);
}
} // namespace cereal
#undef SAVE_DATA_FIELD
@@ -22,24 +22,50 @@ def test_data_structure(input_data):
def test_counter_values(input_data):
data = input_data
agent_data = data["rocprofiler-sdk-json-tool"]["agents"]
counter_data = data["rocprofiler-sdk-json-tool"]["buffer_records"][
"counter_collection"
]
data = input_data["rocprofiler-sdk-json-tool"]
agent_data = data["agents"]
counter_info = data["counter_info"]
counter_data = data["buffer_records"]["counter_collection"]
scaling_factor = 1
for itr in agent_data:
if itr["type"] == 2 and itr["wave_front_size"] > 0:
scaling_factor = 64 / itr["wave_front_size"]
break
for itr in counter_info:
if itr["is_constant"] == 1 and itr["name"] == "size":
continue
assert itr["id"]["handle"] > 0, f"{itr}"
assert itr["is_constant"] in (0, 1), f"{itr}"
assert itr["is_derived"] in (0, 1), f"{itr}"
assert len(itr["name"]) >= 4, f"{itr}"
assert len(itr["description"]) >= 4, f"{itr}"
if itr["is_constant"] == 0:
if itr["is_derived"] == 0:
assert len(itr["block"]) > 0, f"{itr}"
if itr["is_derived"] == 1:
assert len(itr["expression"]) > 0, f"{itr}"
def get_agent(agent_id):
for itr in agent_data:
if itr["id"]["handle"] == agent_id["handle"]:
return itr
return None
def get_scaling_factor(agent_id):
agent = get_agent(agent_id)
assert agent is not None, f"id={agent_id}"
if agent["type"] == 2 and agent["wave_front_size"] > 0:
return 64 / agent["wave_front_size"]
return 0
for itr in counter_data:
value = itr["counter_value"]
if int(round(value, 0)) > 0:
assert int(round(value, 0)) == int(
round(1 * scaling_factor, 0)
), f"agent_data:\n{agent_data}\n\ncounter_data:\n{counter_data}"
assert itr["num_records"] == len(itr["records"]), f"itr={itr}"
agent_id = itr["dispatch_info"]["agent_id"]
agent = get_agent(agent_id)
scaling_factor = get_scaling_factor(agent_id)
assert agent is not None, f"itr={itr}\nagent={agent}"
for ritr in itr["records"]:
value = ritr["counter_value"]
if int(round(value, 0)) > 0:
assert int(round(value, 0)) == int(
round(1 * scaling_factor, 0)
), f"itr={itr}\nagent={agent}"
if __name__ == "__main__":
@@ -254,10 +254,10 @@ def test_kernel_ids(input_data):
assert payload["kernel_name"] == symbol_info[kern_id]["kernel_name"]
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
assert itr["kernel_id"] in symbol_info.keys()
assert itr["dispatch_info"]["kernel_id"] in symbol_info.keys()
for itr in sdk_data["callback_records"]["kernel_dispatches"]:
assert itr["payload"]["kernel_id"] in symbol_info.keys()
assert itr["payload"]["dispatch_info"]["kernel_id"] in symbol_info.keys()
def test_kernel_dispatch_ids(input_data):
@@ -271,11 +271,11 @@ def test_kernel_dispatch_ids(input_data):
bf_seq_ids = []
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
bf_seq_ids.append(itr["dispatch_id"])
bf_seq_ids.append(itr["dispatch_info"]["dispatch_id"])
cb_seq_ids = []
for itr in sdk_data["callback_records"]["kernel_dispatches"]:
cb_seq_ids.append(itr["payload"]["dispatch_id"])
cb_seq_ids.append(itr["payload"]["dispatch_info"]["dispatch_id"])
bf_seq_ids = sorted(bf_seq_ids)
cb_seq_ids = sorted(cb_seq_ids)
@@ -209,7 +209,10 @@ def test_kernel_ids(input_data):
assert payload["kernel_name"] == symbol_info[kern_id]["kernel_name"]
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
assert itr["kernel_id"] in symbol_info.keys()
assert itr["dispatch_info"]["kernel_id"] in symbol_info.keys()
for itr in sdk_data["callback_records"]["kernel_dispatches"]:
assert itr["payload"]["dispatch_info"]["kernel_id"] in symbol_info.keys()
def test_retired_correlation_ids(input_data):
@@ -40,6 +40,8 @@
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/counters.h>
#include <rocprofiler-sdk/dispatch_profile.h>
#include <rocprofiler-sdk/external_correlation.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/internal_threading.h>
@@ -62,6 +64,7 @@
#include <map>
#include <mutex>
#include <shared_mutex>
#include <stdexcept>
#include <string>
#include <string_view>
#include <thread>
@@ -549,11 +552,59 @@ struct scratch_memory_callback_record_t
}
};
struct profile_counting_record
{
profile_counting_record(rocprofiler_profile_counting_dispatch_record_t hdr)
: header{hdr}
{}
rocprofiler_profile_counting_dispatch_record_t header = {};
std::vector<rocprofiler_record_counter_t> data = {};
profile_counting_record() = default;
~profile_counting_record() = default;
profile_counting_record(const profile_counting_record&) = default;
profile_counting_record(profile_counting_record&&) noexcept = default;
profile_counting_record& operator=(const profile_counting_record&) = default;
profile_counting_record& operator=(profile_counting_record&&) noexcept = default;
template <typename ArchiveT>
void save(ArchiveT& ar) const
{
cereal::save(ar, header);
auto _data = data;
for(auto& itr : _data)
{
auto _counter_id = rocprofiler_counter_id_t{};
ROCPROFILER_CALL(rocprofiler_query_record_counter_id(itr.id, &_counter_id),
"failed to query counter id");
itr.id = _counter_id.handle;
}
ar(cereal::make_nvp("records", _data));
}
void emplace_back(rocprofiler_record_counter_t val)
{
if(*this != val)
throw std::runtime_error{"invalid profile_counting_record::emplace_back(...)"};
data.emplace_back(val);
}
bool operator==(rocprofiler_record_counter_t rhs) const
{
return (header.dispatch_info.dispatch_id == rhs.dispatch_id);
}
bool operator!=(rocprofiler_record_counter_t rhs) const { return !(*this == rhs); }
};
auto counter_info = std::deque<rocprofiler_counter_info_v0_t>{};
auto code_object_records = std::deque<code_object_callback_record_t>{};
auto kernel_symbol_records = std::deque<kernel_symbol_callback_record_t>{};
auto hsa_api_cb_records = std::deque<hsa_api_callback_record_t>{};
auto marker_api_cb_records = std::deque<marker_api_callback_record_t>{};
auto counter_collection_bf_records = std::deque<rocprofiler_record_counter_t>{};
auto counter_collection_bf_records = std::deque<profile_counting_record>{};
auto hip_api_cb_records = std::deque<hip_api_callback_record_t>{};
auto scratch_memory_cb_records = std::deque<scratch_memory_callback_record_t>{};
auto kernel_dispatch_cb_records = std::deque<kernel_dispatch_callback_record_t>{};
@@ -583,7 +634,8 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
auto search_cache = [&]() {
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
if(auto pos = profile_cache.find(dispatch_data.dispatch_info.agent_id.handle);
pos != profile_cache.end())
{
*config = pos->second;
return true;
@@ -610,7 +662,7 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
// Iterate through the agents and get the counters available on that agent
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
dispatch_data.agent_id,
dispatch_data.dispatch_info.agent_id,
[]([[maybe_unused]] rocprofiler_agent_id_t id,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -626,18 +678,30 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
static_cast<void*>(&gpu_counters)),
"Could not fetch supported counters");
for(auto& counter : gpu_counters)
{
auto info = rocprofiler_counter_info_v0_t{};
ROCPROFILER_CALL(
rocprofiler_query_counter_info(
counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&info)),
"Could not query counter_id");
counter_info.emplace_back(info);
}
std::vector<rocprofiler_counter_id_t> collect_counters;
// Look for the counters contained in counters_to_collect in gpu_counters
for(auto& counter : gpu_counters)
{
rocprofiler_counter_info_v0_t version;
rocprofiler_counter_info_v0_t info;
ROCPROFILER_CALL(
rocprofiler_query_counter_info(
counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&version)),
counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&info)),
"Could not query counter_id");
if(counters_to_collect.count(std::string(version.name)) > 0)
if(counters_to_collect.count(std::string(info.name)) > 0)
{
collect_counters.push_back(counter);
}
@@ -645,12 +709,13 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
// Create a colleciton profile for the counters
rocprofiler_profile_config_id_t profile;
ROCPROFILER_CALL(
rocprofiler_create_profile_config(
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(rocprofiler_create_profile_config(dispatch_data.dispatch_info.agent_id,
collect_counters.data(),
collect_counters.size(),
&profile),
"Could not construct profile cfg");
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
profile_cache.emplace(dispatch_data.dispatch_info.agent_id.handle, profile);
// Return the profile to collect those counters for this dispatch
*config = profile;
}
@@ -865,10 +930,21 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
"unexpected rocprofiler_record_header_t tracing category kind"};
}
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && header->kind == 0)
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS &&
header->kind == ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADER)
{
auto* profiler_record =
static_cast<rocprofiler_profile_counting_dispatch_record_t*>(header->payload);
counter_collection_bf_records.emplace_back(*profiler_record);
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS &&
header->kind == ROCPROFILER_COUNTER_RECORD_VALUE)
{
auto* profiler_record = static_cast<rocprofiler_record_counter_t*>(header->payload);
counter_collection_bf_records.emplace_back(*profiler_record);
if(counter_collection_bf_records.empty())
throw std::runtime_error{
"missing rocprofiler_profile_counting_dispatch_record_t (header)"};
counter_collection_bf_records.back().emplace_back(*profiler_record);
}
else
{
@@ -1427,7 +1503,8 @@ tool_fini(void* tool_data)
<< ", hip_api_bf_records=" << hip_api_bf_records.size()
<< ", marker_api_bf_records=" << marker_api_bf_records.size()
<< ", corr_id_retire_records=" << corr_id_retire_records.size()
<< ", counter_collection_records=" << counter_collection_bf_records.size() << "...\n"
<< ", counter_collection_value_records=" << counter_collection_bf_records.size()
<< "...\n"
<< std::flush;
auto* _call_stack = static_cast<call_stack_t*>(tool_data);
@@ -1501,6 +1578,7 @@ write_json(call_stack_t* _call_stack)
json_ar.finishNode();
json_ar(cereal::make_nvp("agents", agents));
json_ar(cereal::make_nvp("counter_info", counter_info));
if(_call_stack) json_ar(cereal::make_nvp("call_stack", *_call_stack));
json_ar.setNextName("callback_records");
@@ -1611,7 +1689,8 @@ write_perfetto()
}
for(auto itr : kernel_dispatch_bf_records)
agent_queue_ids[itr.agent_id.handle].emplace(itr.queue_id.handle);
agent_queue_ids[itr.dispatch_info.agent_id.handle].emplace(
itr.dispatch_info.queue_id.handle);
}
auto thread_tracks = std::unordered_map<rocprofiler_thread_id_t, ::perfetto::Track>{};
@@ -1804,10 +1883,11 @@ write_perfetto()
auto demangled = std::unordered_map<std::string_view, std::string>{};
for(auto itr : kernel_dispatch_bf_records)
{
const kernel_symbol_callback_record_t* sym = nullptr;
const auto& info = itr.dispatch_info;
const kernel_symbol_callback_record_t* sym = nullptr;
for(const auto& kitr : kernel_symbol_records)
{
if(kitr.payload.kernel_id == itr.kernel_id)
if(kitr.payload.kernel_id == info.kernel_id)
{
sym = &kitr;
break;
@@ -1815,7 +1895,7 @@ write_perfetto()
}
auto name = std::string_view{sym->payload.kernel_name};
auto& track = agent_queue_tracks.at(itr.agent_id.handle).at(itr.queue_id.handle);
auto& track = agent_queue_tracks.at(info.agent_id.handle).at(info.queue_id.handle);
if(demangled.find(name) == demangled.end())
{
@@ -1833,21 +1913,21 @@ write_perfetto()
"kind",
itr.kind,
"agent",
itr.agent_id.handle,
info.agent_id.handle,
"corr_id",
itr.correlation_id.internal,
"queue",
itr.queue_id.handle,
info.queue_id.handle,
"kernel_id",
itr.kernel_id,
info.kernel_id,
"private_segment_size",
itr.private_segment_size,
info.private_segment_size,
"group_segment_size",
itr.group_segment_size,
info.group_segment_size,
"workgroup_size",
itr.workgroup_size.x * itr.workgroup_size.y * itr.workgroup_size.z,
info.workgroup_size.x * info.workgroup_size.y * info.workgroup_size.z,
"grid_size",
itr.grid_size.x * itr.grid_size.y * itr.grid_size.z);
info.grid_size.x * info.grid_size.y * info.grid_size.z);
TRACE_EVENT_END(rocprofiler::trait::name<rocprofiler::category::kernel_dispatch>::value,
track,