From 1ba08cd4dfb7fe99a51765019210947dfcd199f7 Mon Sep 17 00:00:00 2001 From: "Baraldi, Giovanni" Date: Tue, 29 Jul 2025 22:47:40 +0200 Subject: [PATCH] Removing ATT buffer size limitation (#534) * Removing SQTT buffer size limitation * Update source/lib/rocprofiler-sdk/thread_trace/core.cpp * Added testing for buffer size. Formatting. * Add test as unstable * Increase default buffer size * Apply suggestions from code review Co-authored-by: Indic, Vladimir * Fix typo from code review * Update tests/thread-trace/agent.cpp --------- Co-authored-by: Giovanni Co-authored-by: Indic, Vladimir --- source/bin/rocprofv3.py | 2 +- source/lib/rocprofiler-sdk-tool/config.hpp | 3 +- .../lib/rocprofiler-sdk/aql/aql_profile_v2.h | 8 ++++++ .../rocprofiler-sdk/aql/packet_construct.cpp | 8 ++++-- .../lib/rocprofiler-sdk/thread_trace/core.cpp | 9 +++--- tests/thread-trace/CMakeLists.txt | 17 +++++++++++ tests/thread-trace/agent.cpp | 28 ++++++++++--------- tests/thread-trace/trace_callbacks.hpp | 1 - 8 files changed, 53 insertions(+), 23 deletions(-) diff --git a/source/bin/rocprofv3.py b/source/bin/rocprofv3.py index 571be911ba..1b2acd5dbc 100755 --- a/source/bin/rocprofv3.py +++ b/source/bin/rocprofv3.py @@ -769,7 +769,7 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins att_options.add_argument( "--att-buffer-size", - help="Thread trace buffer size. Default 96MB", + help="Thread trace buffer size. Default 256MB", default=None, type=str, ) diff --git a/source/lib/rocprofiler-sdk-tool/config.hpp b/source/lib/rocprofiler-sdk-tool/config.hpp index a253439542..7da04a9a9c 100644 --- a/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/source/lib/rocprofiler-sdk-tool/config.hpp @@ -139,7 +139,8 @@ struct config : output_config int mpi_rank = get_mpi_rank(); uint64_t att_param_shader_engine_mask = get_env("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1); - uint64_t att_param_buffer_size = get_env("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x6000000); + // 256MB + uint64_t att_param_buffer_size = get_env("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x10000000); uint64_t att_param_simd_select = get_env("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF); uint64_t att_param_target_cu = get_env("ROCPROF_ATT_PARAM_TARGET_CU", 1); uint64_t att_param_perf_ctrl = get_env("ROCPROF_ATT_PARAM_PERFCOUNTER_CTRL", 0); diff --git a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h index 3dcacbb3df..4771f1f204 100644 --- a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h +++ b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h @@ -184,6 +184,14 @@ aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile, aqlprofile_pmc_info_type_t attribute, void* value); +typedef enum aqlprofile_att_parameter_name_ext_t +{ + /** + * HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE + 1 + */ + AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11, +} aqlprofile_att_parameter_name_ext_t; + // Profile parameter object typedef struct { diff --git a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp index 4c739adb83..6425bd3884 100644 --- a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp +++ b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp @@ -145,7 +145,8 @@ ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache& uint32_t cu = static_cast(params.target_cu); uint32_t shader_engine_mask = static_cast(params.shader_engine_mask); uint32_t simd = static_cast(params.simd_select); - uint32_t buffer_size = static_cast(params.buffer_size); + uint32_t buffer_size_lo = static_cast(params.buffer_size); + uint32_t buffer_size_hi = static_cast(params.buffer_size >> 32); uint32_t perf_ctrl = static_cast(params.perfcounter_ctrl); aql_params.clear(); @@ -153,7 +154,10 @@ ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache& aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, {cu}}); aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, {shader_engine_mask}}); aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, {simd}}); - aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, {buffer_size}}); + aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, {buffer_size_lo}}); + + if(buffer_size_hi != 0) aql_params.push_back({static_cast( + AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH), {buffer_size_hi}}); if(perf_ctrl != 0 && !params.perfcounters.empty()) { diff --git a/source/lib/rocprofiler-sdk/thread_trace/core.cpp b/source/lib/rocprofiler-sdk/thread_trace/core.cpp index 16360ad5b2..8f50e08f00 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/core.cpp +++ b/source/lib/rocprofiler-sdk/thread_trace/core.cpp @@ -57,8 +57,7 @@ namespace rocprofiler namespace thread_trace { constexpr size_t QUEUE_SIZE = 128; -constexpr uint64_t MIN_BUFFER_SIZE = 1 << 18; // 2 pages per SE -constexpr uint64_t MAX_BUFFER_SIZE = std::numeric_limits::max(); // aqlprofile limit +constexpr uint64_t MIN_BUFFER_SIZE = 1 << 20; // 1MB struct cbdata_t { @@ -74,15 +73,15 @@ thread_trace_parameter_pack::are_params_valid() const { if(shader_cb_fn == nullptr) { - ROCP_WARNING << "Callback cannot be null!"; + ROCP_CI_LOG(WARNING) << "Callback cannot be null!"; return false; } if(shader_engine_mask == 0) return false; - if(buffer_size > MAX_BUFFER_SIZE || buffer_size < MIN_BUFFER_SIZE) + if(buffer_size < MIN_BUFFER_SIZE) { - ROCP_WARNING << "Invalid buffer size: " << buffer_size; + ROCP_CI_LOG(WARNING) << "Invalid buffer size: " << buffer_size; return false; } diff --git a/tests/thread-trace/CMakeLists.txt b/tests/thread-trace/CMakeLists.txt index 6bf784e5ba..4c43dc9f67 100644 --- a/tests/thread-trace/CMakeLists.txt +++ b/tests/thread-trace/CMakeLists.txt @@ -97,3 +97,20 @@ set_tests_properties( PROPERTIES TIMEOUT 10 LABELS "integration-tests" ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# Test large buffer sizes. 5120 == 5GB +add_test(NAME thread-trace-api-large-buffer-test + COMMAND $) + +set_tests_properties( + thread-trace-api-large-buffer-test + PROPERTIES TIMEOUT + 10 + LABELS + "integration-tests" + ENVIRONMENT + "${PRELOAD_ENV};ATT_BUFFER_SIZE_MB=5120" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED + ${ROCPROFILER_DISABLE_UNSTABLE_CTESTS}) diff --git a/tests/thread-trace/agent.cpp b/tests/thread-trace/agent.cpp index 708ba44968..ccc2004995 100644 --- a/tests/thread-trace/agent.cpp +++ b/tests/thread-trace/agent.cpp @@ -47,19 +47,16 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record, assert(record.payload); auto* rdata = static_cast(record.payload); - int dispatch_id = (int) rdata->dispatch_info.dispatch_id; + auto dispatch_id = rdata->dispatch_info.dispatch_id; - auto get_int_var = [](const char* var_name, int def) { - const char* var = getenv(var_name); - if(var) return atoi(var); - return def; - }; - static int begin_dispatch = get_int_var("ROCPROFILER_THREAD_TRACE_BEGIN", 1); - static int end_dispatch = get_int_var("ROCPROFILER_THREAD_TRACE_END", 4); + // Choose two dispatches to begin(6) and end(10) the trace + constexpr uint64_t begin_dispatch = 6; + constexpr uint64_t end_dispatch = 10; static std::atomic isprofiling{false}; + static std::atomic stop_profiling{false}; - static std::mutex mut; - static std::set captured_ids; + static std::mutex mut{}; + static std::set captured_ids{}; if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) { @@ -73,6 +70,7 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record, std::unique_lock lk(mut); captured_ids.insert(dispatch_id); } + if(dispatch_id > end_dispatch) stop_profiling.store(true); return; } @@ -82,7 +80,7 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record, std::unique_lock lk(mut); captured_ids.erase(dispatch_id); - if(!captured_ids.empty()) return; + if(!captured_ids.empty() || stop_profiling == false) return; bool _exp = true; if(!isprofiling.compare_exchange_strong(_exp, false, std::memory_order_relaxed)) return; @@ -104,11 +102,15 @@ query_available_agents(rocprofiler_agent_version_t /* version */, const auto* agent = static_cast(agents[idx]); if(agent->type != ROCPROFILER_AGENT_TYPE_GPU) continue; + // Check if we are testing for large buffers + static const char* var = getenv("ATT_BUFFER_SIZE_MB"); + static uint64_t buffer_size_mb = (var ? atoi(var) : 96) * 1024ul * 1024ul; + std::vector parameters; parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_TARGET_CU, 1}); parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SIMD_SELECT, 0xF}); - parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, 0x6000000}); - parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, 0x11}); + parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, buffer_size_mb}); + parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, 0x1}); parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SERIALIZE_ALL, 0}); ROCPROFILER_CALL( diff --git a/tests/thread-trace/trace_callbacks.hpp b/tests/thread-trace/trace_callbacks.hpp index 9c175782b0..32afefe053 100644 --- a/tests/thread-trace/trace_callbacks.hpp +++ b/tests/thread-trace/trace_callbacks.hpp @@ -30,7 +30,6 @@ #include #include -#include #include #include #include