abd6029603
* Initial consecutive kernel WIP * Updated logic after discussion, create context only when needed, change set of captured ids to dispatch_id_t type * Updated to fix concurrency issues and revert kernel_iterations * Add captured id in first lock capture * Updated code to use wlock, added comments, removed some unnecessary atomic * Cleaned up, need to add test * Add test to check that generated stats csv file is not empty * Updated test to check if vector-ops kernels are being used * Fix phase bug * Updated for comments * Flattened ATT logic a bit * Fix incorrect if-statement * Fix merge conflict
334 行
13 KiB
C++
334 行
13 KiB
C++
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//

#pragma once

#include "lib/att-tool/att_lib_wrapper.hpp"
#include "lib/common/environment.hpp"
#include "lib/common/filesystem.hpp"
#include "lib/common/mpl.hpp"
#include "lib/common/units.hpp"
#include "lib/output/format_path.hpp"
#include "lib/output/output_config.hpp"

#include <rocprofiler-sdk/cxx/serialization.hpp>

#include <fmt/format.h>

#include <cstdint>
#include <queue>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_set>
#include <vector>

namespace rocprofiler
|
|
{
|
|
namespace tool
|
|
{
|
|
using common::get_env;
|
|
|
|
struct config;
|
|
|
|
/// Selects which configuration instance get_config<>() returns.
/// Only the global context is defined at present.
enum class config_context
{
    global = 0,
};

void
|
|
initialize();
|
|
|
|
template <config_context ContextT = config_context::global>
|
|
config&
|
|
get_config();
|
|
|
|
std::string
|
|
format_name(std::string_view _name, const config& = get_config<>());
|
|
|
|
/// One performance-counter entry stored in config::att_param_perfcounters.
struct att_perfcounter
{
    /// Name of the counter; empty by default.
    std::string counter_name = {};
    /// Mask associated with the counter; defaults to 0xf (low four bits set).
    /// NOTE(review): presumably selects SIMDs -- confirm against the consumer.
    uint32_t simd_mask = 0xf;

    /// Writes both fields to @p ar as name-value pairs (defined below).
    template <typename ArchiveT>
    void save(ArchiveT&) const;

    /// Two entries are equal when both the name and the mask match.
    friend bool operator==(const att_perfcounter& lhs, const att_perfcounter& rhs)
    {
        return std::tie(lhs.counter_name, lhs.simd_mask) ==
               std::tie(rhs.counter_name, rhs.simd_mask);
    }

    /// Negation of operator==, provided so both comparisons are available.
    friend bool operator!=(const att_perfcounter& lhs, const att_perfcounter& rhs)
    {
        return !(lhs == rhs);
    }
};

struct config : output_config
|
|
{
|
|
using base_type = output_config;
|
|
|
|
struct CollectionPeriod
|
|
{
|
|
uint64_t delay = 0;
|
|
uint64_t duration = 0;
|
|
uint64_t repeat = 0;
|
|
|
|
template <typename ArchiveT>
|
|
void save(ArchiveT& ar) const;
|
|
};
|
|
|
|
enum class benchmark
|
|
{
|
|
none = 0,
|
|
disabled_contexts_overhead,
|
|
sdk_callback_overhead,
|
|
sdk_buffered_overhead,
|
|
tool_runtime_overhead,
|
|
execution_profile,
|
|
};
|
|
|
|
config();
|
|
|
|
~config() = default;
|
|
config(const config&) = default;
|
|
config(config&&) noexcept = default;
|
|
config& operator=(const config&) = default;
|
|
config& operator=(config&&) noexcept = default;
|
|
|
|
bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true);
|
|
bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false);
|
|
bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false);
|
|
bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false);
|
|
bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false);
|
|
bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false);
|
|
bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false);
|
|
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
|
|
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
|
|
bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false);
|
|
bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false);
|
|
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
|
|
bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false);
|
|
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
|
|
bool rccl_api_trace = get_env("ROCPROF_RCCL_API_TRACE", false);
|
|
bool rocdecode_api_trace = get_env("ROCPROF_ROCDECODE_API_TRACE", false);
|
|
bool rocjpeg_api_trace = get_env("ROCPROF_ROCJPEG_API_TRACE", false);
|
|
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
|
|
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
|
|
bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false);
|
|
bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false);
|
|
bool enable_signal_handlers = get_env("ROCPROF_SIGNAL_HANDLERS", true);
|
|
bool enable_process_sync = get_env("ROCPROF_PROCESS_SYNC", false);
|
|
bool selected_regions = get_env("ROCPROF_SELECTED_REGIONS", false);
|
|
bool output_config_file = get_env("ROCPROF_OUTPUT_CONFIG_FILE", false);
|
|
bool pc_sampling_host_trap = false;
|
|
bool pc_sampling_stochastic = false;
|
|
size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1);
|
|
rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE;
|
|
rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE;
|
|
|
|
int mpi_size = get_mpi_size();
|
|
int mpi_rank = get_mpi_rank();
|
|
uint64_t att_param_shader_engine_mask =
|
|
get_env<uint64_t>("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1);
|
|
// 256MB
|
|
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x10000000);
|
|
uint64_t att_param_simd_select = get_env<uint64_t>("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF);
|
|
uint64_t att_param_target_cu = get_env<uint64_t>("ROCPROF_ATT_PARAM_TARGET_CU", 1);
|
|
uint64_t att_param_perf_ctrl = get_env<uint64_t>("ROCPROF_ATT_PARAM_PERFCOUNTER_CTRL", 0);
|
|
uint64_t att_consecutive_kernels = get_env<uint64_t>("ROCPROF_ATT_CONSECUTIVE_KERNELS", 0);
|
|
|
|
std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*");
|
|
std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", "");
|
|
std::string pc_sampling_method = get_env("ROCPROF_PC_SAMPLING_METHOD", "none");
|
|
std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none");
|
|
std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", "");
|
|
std::string att_library_path = get_env("ROCPROF_ATT_LIBRARY_PATH", "");
|
|
std::string att_gpu_index = get_env("ROCPROF_ATT_PARAM_GPU_INDEX", "");
|
|
|
|
std::unordered_set<size_t> kernel_filter_range = {};
|
|
std::vector<std::set<std::string>> counters = {};
|
|
std::vector<att_perfcounter> att_param_perfcounters = {};
|
|
|
|
std::queue<CollectionPeriod> collection_periods = {};
|
|
uint64_t counter_groups_random_seed = get_env("ROCPROF_COUNTER_GROUPS_RANDOM_SEED", 0);
|
|
uint64_t counter_groups_interval = get_env("ROCPROF_COUNTER_GROUPS_INTERVAL", 1);
|
|
uint64_t minimum_output_bytes = get_env("ROCPROF_MINIMUM_OUTPUT_BYTES", 0);
|
|
|
|
std::string benchmark_mode_env = get_env("ROCPROF_BENCHMARK_MODE", "");
|
|
benchmark benchmark_mode = benchmark::none;
|
|
|
|
template <typename ArchiveT>
|
|
void save(ArchiveT&) const;
|
|
|
|
template <typename ArchiveT>
|
|
void load(ArchiveT&)
|
|
{}
|
|
|
|
auto get_attach_invariants() const;
|
|
};
|
|
|
|
// Helpers for the save() definitions below; both expect an archive named `ar`
// in scope and are #undef'd once those definitions are complete.
// CFG_SERIALIZE_MEMBER uses the variable's own spelling as the key,
// CFG_SERIALIZE_NAMED_MEMBER takes the key explicitly.
#define CFG_SERIALIZE_MEMBER(VAR)             ar(cereal::make_nvp(#VAR, VAR))
#define CFG_SERIALIZE_NAMED_MEMBER(NAME, VAR) ar(cereal::make_nvp(NAME, VAR))

inline auto
|
|
config::get_attach_invariants() const
|
|
{
|
|
return std::make_tuple(kernel_trace,
|
|
hsa_core_api_trace,
|
|
hsa_amd_ext_api_trace,
|
|
hsa_image_ext_api_trace,
|
|
hsa_finalizer_ext_api_trace,
|
|
marker_api_trace,
|
|
memory_copy_trace,
|
|
memory_allocation_trace,
|
|
scratch_memory_trace,
|
|
counter_collection,
|
|
hip_runtime_api_trace,
|
|
hip_compiler_api_trace,
|
|
rccl_api_trace,
|
|
rocdecode_api_trace,
|
|
rocjpeg_api_trace,
|
|
advanced_thread_trace,
|
|
att_serialize_all,
|
|
att_param_shader_engine_mask,
|
|
att_param_buffer_size,
|
|
att_param_simd_select,
|
|
att_param_target_cu,
|
|
att_library_path,
|
|
att_param_perfcounters,
|
|
att_param_perf_ctrl,
|
|
pc_sampling_method,
|
|
pc_sampling_unit,
|
|
kernel_filter_include,
|
|
kernel_filter_exclude,
|
|
kernel_filter_range,
|
|
extra_counters_contents,
|
|
counter_groups_random_seed,
|
|
counter_groups_interval,
|
|
benchmark_mode);
|
|
}
|
|
|
|
inline bool
|
|
is_attach_invariant(const config& lhs, const config& rhs)
|
|
{
|
|
return lhs.get_attach_invariants() == rhs.get_attach_invariants();
|
|
}
|
|
|
|
template <typename ArchiveT>
|
|
void
|
|
att_perfcounter::save(ArchiveT& ar) const
|
|
{
|
|
CFG_SERIALIZE_MEMBER(counter_name);
|
|
CFG_SERIALIZE_MEMBER(simd_mask);
|
|
}
|
|
|
|
template <typename ArchiveT>
|
|
void
|
|
config::CollectionPeriod::save(ArchiveT& ar) const
|
|
{
|
|
CFG_SERIALIZE_MEMBER(delay);
|
|
CFG_SERIALIZE_MEMBER(duration);
|
|
CFG_SERIALIZE_MEMBER(repeat);
|
|
}
|
|
|
|
template <typename ArchiveT>
|
|
void
|
|
config::save(ArchiveT& ar) const
|
|
{
|
|
CFG_SERIALIZE_NAMED_MEMBER("benchmark_mode", benchmark_mode_env);
|
|
|
|
CFG_SERIALIZE_MEMBER(kernel_trace);
|
|
CFG_SERIALIZE_MEMBER(hsa_core_api_trace);
|
|
CFG_SERIALIZE_MEMBER(hsa_amd_ext_api_trace);
|
|
CFG_SERIALIZE_MEMBER(hsa_image_ext_api_trace);
|
|
CFG_SERIALIZE_MEMBER(hsa_finalizer_ext_api_trace);
|
|
CFG_SERIALIZE_MEMBER(marker_api_trace);
|
|
CFG_SERIALIZE_MEMBER(memory_copy_trace);
|
|
CFG_SERIALIZE_MEMBER(memory_allocation_trace);
|
|
CFG_SERIALIZE_MEMBER(scratch_memory_trace);
|
|
CFG_SERIALIZE_MEMBER(counter_collection);
|
|
CFG_SERIALIZE_MEMBER(hip_runtime_api_trace);
|
|
CFG_SERIALIZE_MEMBER(hip_compiler_api_trace);
|
|
CFG_SERIALIZE_MEMBER(rccl_api_trace);
|
|
CFG_SERIALIZE_MEMBER(rocdecode_api_trace);
|
|
CFG_SERIALIZE_MEMBER(rocjpeg_api_trace);
|
|
|
|
CFG_SERIALIZE_MEMBER(mpi_rank);
|
|
CFG_SERIALIZE_MEMBER(mpi_size);
|
|
CFG_SERIALIZE_MEMBER(collection_periods);
|
|
CFG_SERIALIZE_MEMBER(counters);
|
|
CFG_SERIALIZE_MEMBER(extra_counters_contents);
|
|
CFG_SERIALIZE_MEMBER(kernel_filter_include);
|
|
CFG_SERIALIZE_MEMBER(kernel_filter_exclude);
|
|
CFG_SERIALIZE_MEMBER(kernel_filter_range);
|
|
CFG_SERIALIZE_MEMBER(demangle);
|
|
CFG_SERIALIZE_MEMBER(truncate);
|
|
CFG_SERIALIZE_MEMBER(minimum_output_bytes);
|
|
CFG_SERIALIZE_MEMBER(enable_signal_handlers);
|
|
CFG_SERIALIZE_MEMBER(enable_process_sync);
|
|
CFG_SERIALIZE_MEMBER(selected_regions);
|
|
|
|
CFG_SERIALIZE_MEMBER(counter_groups_random_seed);
|
|
CFG_SERIALIZE_MEMBER(counter_groups_interval);
|
|
|
|
CFG_SERIALIZE_MEMBER(pc_sampling_host_trap);
|
|
CFG_SERIALIZE_MEMBER(pc_sampling_stochastic);
|
|
CFG_SERIALIZE_MEMBER(pc_sampling_method);
|
|
CFG_SERIALIZE_MEMBER(pc_sampling_unit);
|
|
CFG_SERIALIZE_MEMBER(pc_sampling_interval);
|
|
CFG_SERIALIZE_MEMBER(pc_sampling_method_value);
|
|
CFG_SERIALIZE_MEMBER(pc_sampling_unit_value);
|
|
|
|
CFG_SERIALIZE_MEMBER(advanced_thread_trace);
|
|
CFG_SERIALIZE_MEMBER(att_serialize_all);
|
|
CFG_SERIALIZE_MEMBER(att_param_shader_engine_mask);
|
|
CFG_SERIALIZE_MEMBER(att_param_buffer_size);
|
|
CFG_SERIALIZE_MEMBER(att_param_simd_select);
|
|
CFG_SERIALIZE_MEMBER(att_param_target_cu);
|
|
CFG_SERIALIZE_MEMBER(att_library_path);
|
|
CFG_SERIALIZE_MEMBER(att_param_perfcounters);
|
|
CFG_SERIALIZE_MEMBER(att_param_perf_ctrl);
|
|
CFG_SERIALIZE_MEMBER(att_consecutive_kernels);
|
|
|
|
// serialize the base class
|
|
static_cast<const base_type*>(this)->save(ar);
|
|
}
|
|
|
|
// the serialization helpers are local to this header; do not leak them
#undef CFG_SERIALIZE_MEMBER
#undef CFG_SERIALIZE_NAMED_MEMBER

template <config_context ContextT>
|
|
config&
|
|
get_config()
|
|
{
|
|
if constexpr(ContextT == config_context::global)
|
|
{
|
|
static auto* _v = new config{};
|
|
return *_v;
|
|
}
|
|
else
|
|
{
|
|
// context specific config copied from global config
|
|
static auto* _v = new config{get_config<config_context::global>()};
|
|
return *_v;
|
|
}
|
|
}
|
|
}  // namespace tool
}  // namespace rocprofiler