Removing ATT buffer size limitation (#534)

* Removing SQTT buffer size limitation

* Update source/lib/rocprofiler-sdk/thread_trace/core.cpp

* Added testing for buffer size. Formatting.

* Add test as unstable

* Increase default buffer size

* Apply suggestions from code review

Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com>

* Fix typo from code review

* Update tests/thread-trace/agent.cpp

---------

Co-authored-by: Giovanni <gbaraldi@amd.com>
Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com>
This commit is contained in:
Baraldi, Giovanni
2025-07-29 22:47:40 +02:00
committed by GitHub
parent 2d8936362e
commit 1ba08cd4df
8 changed files with 53 additions and 23 deletions
+1 -1
View File
@@ -769,7 +769,7 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
att_options.add_argument(
"--att-buffer-size",
help="Thread trace buffer size. Default 96MB",
help="Thread trace buffer size. Default 256MB",
default=None,
type=str,
)
+2 -1
View File
@@ -139,7 +139,8 @@ struct config : output_config
int mpi_rank = get_mpi_rank();
uint64_t att_param_shader_engine_mask =
get_env<uint64_t>("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1);
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x6000000);
// 256MB
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x10000000);
uint64_t att_param_simd_select = get_env<uint64_t>("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF);
uint64_t att_param_target_cu = get_env<uint64_t>("ROCPROF_ATT_PARAM_TARGET_CU", 1);
uint64_t att_param_perf_ctrl = get_env<uint64_t>("ROCPROF_ATT_PARAM_PERFCOUNTER_CTRL", 0);
@@ -184,6 +184,14 @@ aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile,
aqlprofile_pmc_info_type_t attribute,
void* value);
/**
 * Extra ATT (thread trace) parameter names that extend the
 * HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_* values. Callers cast these to
 * hsa_ven_amd_aqlprofile_parameter_name_t when building the parameter list.
 */
typedef enum aqlprofile_att_parameter_name_ext_t
{
/**
* HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE + 1
*
* Carries the upper 32 bits of a 64-bit ATT buffer size. The lower 32 bits are
* still passed through HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE.
* The thread trace packet factory only appends this parameter when the upper
* half is non-zero.
*/
AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11,
} aqlprofile_att_parameter_name_ext_t;
// Profile parameter object
typedef struct
{
@@ -145,7 +145,8 @@ ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache&
uint32_t cu = static_cast<uint32_t>(params.target_cu);
uint32_t shader_engine_mask = static_cast<uint32_t>(params.shader_engine_mask);
uint32_t simd = static_cast<uint32_t>(params.simd_select);
uint32_t buffer_size = static_cast<uint32_t>(params.buffer_size);
uint32_t buffer_size_lo = static_cast<uint32_t>(params.buffer_size);
uint32_t buffer_size_hi = static_cast<uint32_t>(params.buffer_size >> 32);
uint32_t perf_ctrl = static_cast<uint32_t>(params.perfcounter_ctrl);
aql_params.clear();
@@ -153,7 +154,10 @@ ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache&
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, {cu}});
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, {shader_engine_mask}});
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, {simd}});
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, {buffer_size}});
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, {buffer_size_lo}});
if(buffer_size_hi != 0) aql_params.push_back({static_cast<hsa_ven_amd_aqlprofile_parameter_name_t>(
AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH), {buffer_size_hi}});
if(perf_ctrl != 0 && !params.perfcounters.empty())
{
@@ -57,8 +57,7 @@ namespace rocprofiler
namespace thread_trace
{
constexpr size_t QUEUE_SIZE = 128;
constexpr uint64_t MIN_BUFFER_SIZE = 1 << 18; // 2 pages per SE
constexpr uint64_t MAX_BUFFER_SIZE = std::numeric_limits<int32_t>::max(); // aqlprofile limit
constexpr uint64_t MIN_BUFFER_SIZE = 1 << 20; // 1MB
struct cbdata_t
{
@@ -74,15 +73,15 @@ thread_trace_parameter_pack::are_params_valid() const
{
if(shader_cb_fn == nullptr)
{
ROCP_WARNING << "Callback cannot be null!";
ROCP_CI_LOG(WARNING) << "Callback cannot be null!";
return false;
}
if(shader_engine_mask == 0) return false;
if(buffer_size > MAX_BUFFER_SIZE || buffer_size < MIN_BUFFER_SIZE)
if(buffer_size < MIN_BUFFER_SIZE)
{
ROCP_WARNING << "Invalid buffer size: " << buffer_size;
ROCP_CI_LOG(WARNING) << "Invalid buffer size: " << buffer_size;
return false;
}
+17
View File
@@ -97,3 +97,20 @@ set_tests_properties(
PROPERTIES TIMEOUT 10 LABELS "integration-tests" ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
# Test large buffer sizes. 5120 == 5GB
# Reruns the thread-trace-api-agent-test binary with ATT_BUFFER_SIZE_MB=5120.
# The test converts that value from MB to bytes, giving a size that does not fit
# in 32 bits, so the split low/high buffer-size parameters are exercised.
# The DISABLED property takes its value from ROCPROFILER_DISABLE_UNSTABLE_CTESTS,
# i.e. this test is treated as one of the unstable tests.
add_test(NAME thread-trace-api-large-buffer-test
COMMAND $<TARGET_FILE:thread-trace-api-agent-test>)
set_tests_properties(
thread-trace-api-large-buffer-test
PROPERTIES TIMEOUT
10
LABELS
"integration-tests"
ENVIRONMENT
"${PRELOAD_ENV};ATT_BUFFER_SIZE_MB=5120"
FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
DISABLED
${ROCPROFILER_DISABLE_UNSTABLE_CTESTS})
+15 -13
View File
@@ -47,19 +47,16 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record,
assert(record.payload);
auto* rdata = static_cast<rocprofiler_callback_tracing_kernel_dispatch_data_t*>(record.payload);
int dispatch_id = (int) rdata->dispatch_info.dispatch_id;
auto dispatch_id = rdata->dispatch_info.dispatch_id;
auto get_int_var = [](const char* var_name, int def) {
const char* var = getenv(var_name);
if(var) return atoi(var);
return def;
};
static int begin_dispatch = get_int_var("ROCPROFILER_THREAD_TRACE_BEGIN", 1);
static int end_dispatch = get_int_var("ROCPROFILER_THREAD_TRACE_END", 4);
// Choose two dispatches to begin(6) and end(10) the trace
constexpr uint64_t begin_dispatch = 6;
constexpr uint64_t end_dispatch = 10;
static std::atomic<bool> isprofiling{false};
static std::atomic<bool> stop_profiling{false};
static std::mutex mut;
static std::set<int> captured_ids;
static std::mutex mut{};
static std::set<int> captured_ids{};
if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
{
@@ -73,6 +70,7 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record,
std::unique_lock<std::mutex> lk(mut);
captured_ids.insert(dispatch_id);
}
if(dispatch_id > end_dispatch) stop_profiling.store(true);
return;
}
@@ -82,7 +80,7 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record,
std::unique_lock<std::mutex> lk(mut);
captured_ids.erase(dispatch_id);
if(!captured_ids.empty()) return;
if(!captured_ids.empty() || stop_profiling == false) return;
bool _exp = true;
if(!isprofiling.compare_exchange_strong(_exp, false, std::memory_order_relaxed)) return;
@@ -104,11 +102,15 @@ query_available_agents(rocprofiler_agent_version_t /* version */,
const auto* agent = static_cast<const rocprofiler_agent_v0_t*>(agents[idx]);
if(agent->type != ROCPROFILER_AGENT_TYPE_GPU) continue;
// Check if we are testing for large buffers
static const char* var = getenv("ATT_BUFFER_SIZE_MB");
static uint64_t buffer_size_mb = (var ? atoi(var) : 96) * 1024ul * 1024ul;
std::vector<rocprofiler_thread_trace_parameter_t> parameters;
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_TARGET_CU, 1});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SIMD_SELECT, 0xF});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, 0x6000000});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, 0x11});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, buffer_size_mb});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, 0x1});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SERIALIZE_ALL, 0});
ROCPROFILER_CALL(
-1
View File
@@ -30,7 +30,6 @@
#include <rocprofiler-sdk/rocprofiler.h>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>