Removing ATT buffer size limitation (#534)
* Removing SQTT buffer size limitation
* Update source/lib/rocprofiler-sdk/thread_trace/core.cpp
* Added testing for buffer size. Formatting.
* Add test as unstable
* Increase default buffer size
* Apply suggestions from code review
  Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com>
* Fix typo from code review
* Update tests/thread-trace/agent.cpp
---------
Co-authored-by: Giovanni <gbaraldi@amd.com>
Co-authored-by: Indic, Vladimir <Vladimir.Indic@amd.com>
This commit is contained in:
committed by
GitHub
parent
2d8936362e
commit
1ba08cd4df
@@ -769,7 +769,7 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
|
||||
att_options.add_argument(
|
||||
"--att-buffer-size",
|
||||
help="Thread trace buffer size. Default 96MB",
|
||||
help="Thread trace buffer size. Default 256MB",
|
||||
default=None,
|
||||
type=str,
|
||||
)
|
||||
|
||||
@@ -139,7 +139,8 @@ struct config : output_config
|
||||
int mpi_rank = get_mpi_rank();
|
||||
uint64_t att_param_shader_engine_mask =
|
||||
get_env<uint64_t>("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1);
|
||||
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x6000000);
|
||||
// 256MB
|
||||
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x10000000);
|
||||
uint64_t att_param_simd_select = get_env<uint64_t>("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF);
|
||||
uint64_t att_param_target_cu = get_env<uint64_t>("ROCPROF_ATT_PARAM_TARGET_CU", 1);
|
||||
uint64_t att_param_perf_ctrl = get_env<uint64_t>("ROCPROF_ATT_PARAM_PERFCOUNTER_CTRL", 0);
|
||||
|
||||
@@ -184,6 +184,14 @@ aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile,
|
||||
aqlprofile_pmc_info_type_t attribute,
|
||||
void* value);
|
||||
|
||||
typedef enum aqlprofile_att_parameter_name_ext_t
|
||||
{
|
||||
/**
|
||||
* Upper 32 bits of the ATT buffer size; equals HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE + 1
|
||||
*/
|
||||
AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11,
|
||||
} aqlprofile_att_parameter_name_ext_t;
|
||||
|
||||
// Profile parameter object
|
||||
typedef struct
|
||||
{
|
||||
|
||||
@@ -145,7 +145,8 @@ ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache&
|
||||
uint32_t cu = static_cast<uint32_t>(params.target_cu);
|
||||
uint32_t shader_engine_mask = static_cast<uint32_t>(params.shader_engine_mask);
|
||||
uint32_t simd = static_cast<uint32_t>(params.simd_select);
|
||||
uint32_t buffer_size = static_cast<uint32_t>(params.buffer_size);
|
||||
uint32_t buffer_size_lo = static_cast<uint32_t>(params.buffer_size);
|
||||
uint32_t buffer_size_hi = static_cast<uint32_t>(params.buffer_size >> 32);
|
||||
uint32_t perf_ctrl = static_cast<uint32_t>(params.perfcounter_ctrl);
|
||||
|
||||
aql_params.clear();
|
||||
@@ -153,7 +154,10 @@ ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(const hsa::AgentCache&
|
||||
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, {cu}});
|
||||
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, {shader_engine_mask}});
|
||||
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, {simd}});
|
||||
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, {buffer_size}});
|
||||
aql_params.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, {buffer_size_lo}});
|
||||
|
||||
if(buffer_size_hi != 0) aql_params.push_back({static_cast<hsa_ven_amd_aqlprofile_parameter_name_t>(
|
||||
AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH), {buffer_size_hi}});
|
||||
|
||||
if(perf_ctrl != 0 && !params.perfcounters.empty())
|
||||
{
|
||||
|
||||
@@ -57,8 +57,7 @@ namespace rocprofiler
|
||||
namespace thread_trace
|
||||
{
|
||||
constexpr size_t QUEUE_SIZE = 128;
|
||||
constexpr uint64_t MIN_BUFFER_SIZE = 1 << 18; // 2 pages per SE
|
||||
constexpr uint64_t MAX_BUFFER_SIZE = std::numeric_limits<int32_t>::max(); // aqlprofile limit
|
||||
constexpr uint64_t MIN_BUFFER_SIZE = 1 << 20; // 1MB
|
||||
|
||||
struct cbdata_t
|
||||
{
|
||||
@@ -74,15 +73,15 @@ thread_trace_parameter_pack::are_params_valid() const
|
||||
{
|
||||
if(shader_cb_fn == nullptr)
|
||||
{
|
||||
ROCP_WARNING << "Callback cannot be null!";
|
||||
ROCP_CI_LOG(WARNING) << "Callback cannot be null!";
|
||||
return false;
|
||||
}
|
||||
|
||||
if(shader_engine_mask == 0) return false;
|
||||
|
||||
if(buffer_size > MAX_BUFFER_SIZE || buffer_size < MIN_BUFFER_SIZE)
|
||||
if(buffer_size < MIN_BUFFER_SIZE)
|
||||
{
|
||||
ROCP_WARNING << "Invalid buffer size: " << buffer_size;
|
||||
ROCP_CI_LOG(WARNING) << "Invalid buffer size: " << buffer_size;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -97,3 +97,20 @@ set_tests_properties(
|
||||
PROPERTIES TIMEOUT 10 LABELS "integration-tests" ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
# Test large buffer sizes: ATT_BUFFER_SIZE_MB=5120 requests a 5 GB (5120 MB) buffer
|
||||
add_test(NAME thread-trace-api-large-buffer-test
|
||||
COMMAND $<TARGET_FILE:thread-trace-api-agent-test>)
|
||||
|
||||
set_tests_properties(
|
||||
thread-trace-api-large-buffer-test
|
||||
PROPERTIES TIMEOUT
|
||||
10
|
||||
LABELS
|
||||
"integration-tests"
|
||||
ENVIRONMENT
|
||||
"${PRELOAD_ENV};ATT_BUFFER_SIZE_MB=5120"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
DISABLED
|
||||
${ROCPROFILER_DISABLE_UNSTABLE_CTESTS})
|
||||
|
||||
@@ -47,19 +47,16 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
|
||||
assert(record.payload);
|
||||
auto* rdata = static_cast<rocprofiler_callback_tracing_kernel_dispatch_data_t*>(record.payload);
|
||||
int dispatch_id = (int) rdata->dispatch_info.dispatch_id;
|
||||
auto dispatch_id = rdata->dispatch_info.dispatch_id;
|
||||
|
||||
auto get_int_var = [](const char* var_name, int def) {
|
||||
const char* var = getenv(var_name);
|
||||
if(var) return atoi(var);
|
||||
return def;
|
||||
};
|
||||
static int begin_dispatch = get_int_var("ROCPROFILER_THREAD_TRACE_BEGIN", 1);
|
||||
static int end_dispatch = get_int_var("ROCPROFILER_THREAD_TRACE_END", 4);
|
||||
// Choose two dispatches to begin(6) and end(10) the trace
|
||||
constexpr uint64_t begin_dispatch = 6;
|
||||
constexpr uint64_t end_dispatch = 10;
|
||||
static std::atomic<bool> isprofiling{false};
|
||||
static std::atomic<bool> stop_profiling{false};
|
||||
|
||||
static std::mutex mut;
|
||||
static std::set<int> captured_ids;
|
||||
static std::mutex mut{};
|
||||
static std::set<int> captured_ids{};
|
||||
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
|
||||
{
|
||||
@@ -73,6 +70,7 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
std::unique_lock<std::mutex> lk(mut);
|
||||
captured_ids.insert(dispatch_id);
|
||||
}
|
||||
if(dispatch_id > end_dispatch) stop_profiling.store(true);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -82,7 +80,7 @@ dispatch_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
|
||||
std::unique_lock<std::mutex> lk(mut);
|
||||
captured_ids.erase(dispatch_id);
|
||||
if(!captured_ids.empty()) return;
|
||||
if(!captured_ids.empty() || stop_profiling == false) return;
|
||||
|
||||
bool _exp = true;
|
||||
if(!isprofiling.compare_exchange_strong(_exp, false, std::memory_order_relaxed)) return;
|
||||
@@ -104,11 +102,15 @@ query_available_agents(rocprofiler_agent_version_t /* version */,
|
||||
const auto* agent = static_cast<const rocprofiler_agent_v0_t*>(agents[idx]);
|
||||
if(agent->type != ROCPROFILER_AGENT_TYPE_GPU) continue;
|
||||
|
||||
// Check if we are testing for large buffers
|
||||
static const char* var = getenv("ATT_BUFFER_SIZE_MB");
|
||||
static uint64_t buffer_size_mb = (var ? atoi(var) : 96) * 1024ul * 1024ul;
|
||||
|
||||
std::vector<rocprofiler_thread_trace_parameter_t> parameters;
|
||||
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_TARGET_CU, 1});
|
||||
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SIMD_SELECT, 0xF});
|
||||
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, 0x6000000});
|
||||
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, 0x11});
|
||||
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, buffer_size_mb});
|
||||
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, 0x1});
|
||||
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SERIALIZE_ALL, 0});
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
Reference in New Issue
Block a user