rocprofiler_iterate_info workaround + omnitrace-avail update (#270)

* rocprofiler_iterate_info workaround + omnitrace-avail update

- provides workaround for rocprofiler_iterate_info behavior change in ROCm 5.4.0-3
- update timemory submodule with argparse tweaks
- updates hsa_rsrc_factory.{hpp,cpp}
- colorized log in omnitrace-avail
- Bump version to 1.9.2

* Fix empty_base inheritance

- timemory's component::empty_base inherits from concepts::component so direct inheritance was removed

* Fix OMNITRACE_HIP_VERSION_COMPAT_STRING

- defined as "" when OMNITRACE_HIP_VERSION_MAJOR==0

* new defines + extra info

- define OMNITRACE_LIBRARY_ARCH (via CMAKE_LIBRARY_ARCHITECTURE)
- define OMNITRACE_SYSTEM_NAME (via CMAKE_SYSTEM_NAME)
- define OMNITRACE_SYSTEM_PROCESSOR (via CMAKE_SYSTEM_PROCESSOR)
- define OMNITRACE_SYSTEM_VERSION (via CMAKE_SYSTEM_VERSION)
- define OMNITRACE_COMPILER_ID (via CMAKE_CXX_COMPILER_ID)
- define OMNITRACE_COMPILER_VERSION (via CMAKE_CXX_COMPILER_VERSION)
- include this info in metadata
- include subset of this info in --version for bin tools
- tweak to perfetto verbose messages

[ROCm/rocprofiler-systems commit: 4ed5f3e67b]
Tento commit je obsažen v:
Jonathan R. Madsen
2023-03-30 04:21:43 -05:00
odevzdal GitHub
rodič a1213480e0
revize 70c8d1229c
21 změnil soubory, kde provedl 425 přidání a 148 odebrání
+1 -1
Zobrazit soubor
@@ -1 +1 @@
1.9.1
1.9.2
Submodul projects/rocprofiler-systems/external/timemory aktualizován: 1ab76c36ef...2b92a966d7
+35 -10
Zobrazit soubor
@@ -22,6 +22,7 @@
#include "avail.hpp"
#include "common.hpp"
#include "common/defines.h"
#include "component_categories.hpp"
#include "defines.hpp"
#include "enumerated_list.hpp"
@@ -108,7 +109,11 @@ void
write_hw_counter_info(std::ostream&, const array_t<bool, N>& = {},
const array_t<bool, N>& = {}, const array_t<string_t, N>& = {});
int gpu_count = 0;
namespace
{
// initialize HIP before main so that libomnitrace is not HSA_TOOLS_LIB
int gpu_count = omnitrace::gpu::hip_device_count();
} // namespace
//--------------------------------------------------------------------------------------//
@@ -174,11 +179,23 @@ main(int argc, char** argv)
parser_t parser("omnitrace-avail");
parser.enable_help();
parser.enable_version("omnitrace-avail", "v" OMNITRACE_VERSION_STRING,
OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION);
parser.set_help_width(40);
auto _cols = std::get<0>(tim::utility::console::get_columns());
if(_cols > parser.get_help_width() + 8)
parser.set_description_width(
std::min<int>(_cols - parser.get_help_width() - 8, 120));
parser.enable_help();
parser.enable_version("omnitrace-avail", OMNITRACE_ARGPARSE_VERSION_INFO);
parser.add_argument({ "--monochrome" }, "Disable colorized output")
.max_count(1)
.dtype("bool")
.action([&](parser_t& p) {
auto _monochrome = p.get<bool>("monochrome");
tim::log::monochrome() = _monochrome;
p.set_use_color(!_monochrome);
});
parser.add_argument({ "--debug" }, "Enable debug messages")
.max_count(1)
.action([](parser_t& p) { debug_msg = p.get<bool>("debug"); });
@@ -468,12 +485,20 @@ main(int argc, char** argv)
}
#if OMNITRACE_USE_HIP > 0
// initialize HIP and call rocm_metrics() which add choices to OMNITRACE_ROCM_EVENTS
// setting
auto _status = hipGetDeviceCount(&gpu_count);
if(gpu_count > 0 && _status == hipSuccess)
if(gpu_count > 0)
{
(void) omnitrace::rocprofiler::rocm_metrics();
size_t _num_metrics = 0;
try
{
// call to rocm_metrics() will add choices to OMNITRACE_ROCM_EVENTS setting
// so always perform this call even if list of HW counters is not requested
_num_metrics = omnitrace::rocprofiler::rocm_metrics().size();
} catch(std::runtime_error& _e)
{
verbprintf(0, "Retrieving the GPU HW counters failed: %s", _e.what());
}
verbprintf(0, "Found %i HIP devices and %zu GPU HW counters\n", gpu_count,
_num_metrics);
}
else
{
+22 -4
Zobrazit soubor
@@ -116,6 +116,8 @@ extern std::string settings_rexclude_exact;
// leading matches, e.g. OMNITRACE_MPI_[A-Z_]+
extern std::string settings_rexclude_begin;
constexpr size_t max_error_message_buffer_length = 4096;
//--------------------------------------------------------------------------------------//
// functions
@@ -149,18 +151,27 @@ file_exists(const std::string&);
// control debug printf statements
#define errprintf(LEVEL, ...) \
{ \
if(werror || LEVEL < 0) \
if(LEVEL < verbose_level) \
{ \
if(debug_msg || verbose_level >= LEVEL) \
{ \
fprintf(stderr, "%s", tim::log::color::fatal()); \
fprintf(stderr, "[omnitrace][avail] Error! " __VA_ARGS__); \
char _buff[FUNCNAMELEN]; \
sprintf(_buff, "[omnitrace][avail] Error! " __VA_ARGS__); \
fprintf(stderr, "%s", tim::log::color::end()); \
} \
char _buff[max_error_message_buffer_length]; \
snprintf(_buff, max_error_message_buffer_length, \
"[omnitrace][avail] Error! " __VA_ARGS__); \
throw std::runtime_error(std::string{ _buff }); \
} \
else \
{ \
if(debug_msg || verbose_level >= LEVEL) \
{ \
fprintf(stderr, "%s", tim::log::color::warning()); \
fprintf(stderr, "[omnitrace][avail] Warning! " __VA_ARGS__); \
fprintf(stderr, "%s", tim::log::color::end()); \
} \
} \
fflush(stderr); \
}
@@ -169,12 +180,19 @@ file_exists(const std::string&);
#define verbprintf(LEVEL, ...) \
{ \
if(debug_msg || verbose_level >= LEVEL) \
{ \
fprintf(stderr, "%s", tim::log::color::info()); \
fprintf(stderr, "[omnitrace][avail] " __VA_ARGS__); \
fprintf(stderr, "%s", tim::log::color::end()); \
} \
fflush(stderr); \
}
#define verbprintf_bare(LEVEL, ...) \
{ \
if(debug_msg || verbose_level >= LEVEL) fprintf(stderr, __VA_ARGS__); \
if(debug_msg || verbose_level >= LEVEL) \
{ \
fprintf(stderr, __VA_ARGS__); \
} \
fflush(stderr); \
}
+1 -2
Zobrazit soubor
@@ -549,8 +549,7 @@ parse_args(int argc, char** argv, std::vector<char*>& _env,
});
parser.enable_help();
parser.enable_version("omnitrace-causal", "v" OMNITRACE_VERSION_STRING,
OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION);
parser.enable_version("omnitrace-causal", OMNITRACE_ARGPARSE_VERSION_INFO);
auto _cols = std::get<0>(console::get_columns());
if(_cols > parser.get_help_width() + 8)
@@ -461,8 +461,7 @@ main(int argc, char** argv)
string_t extra_help = "-- <CMD> <ARGS>";
parser.enable_help();
parser.enable_version("omnitrace", "v" OMNITRACE_VERSION_STRING,
OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION);
parser.enable_version("omnitrace-instrument", OMNITRACE_ARGPARSE_VERSION_INFO);
parser.add_argument({ "" }, "");
parser.add_argument({ "[DEBUG OPTIONS]" }, "");
+1 -2
Zobrazit soubor
@@ -292,8 +292,7 @@ parse_args(int argc, char** argv, parser_data_t& _parser_data)
});
parser.enable_help("", "Usage: omnitrace-run <OPTIONS> -- <COMMAND> <ARGS>");
parser.enable_version("omnitrace-run", "v" OMNITRACE_VERSION_STRING,
OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION);
parser.enable_version("omnitrace-run", OMNITRACE_ARGPARSE_VERSION_INFO);
auto _cols = std::get<0>(console::get_columns());
if(_cols > parser.get_help_width() + 8)
+1 -2
Zobrazit soubor
@@ -369,8 +369,7 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
parser.set_use_color(true);
parser.enable_help();
parser.enable_version("omnitrace-sample", "v" OMNITRACE_VERSION_STRING,
OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION);
parser.enable_version("omnitrace-sample", OMNITRACE_ARGPARSE_VERSION_INFO);
auto _cols = std::get<0>(tim::utility::console::get_columns());
if(_cols > parser.get_help_width() + 8)
+32
Zobrazit soubor
@@ -30,6 +30,17 @@
#define OMNITRACE_GIT_DESCRIBE "@OMNITRACE_GIT_DESCRIBE@"
#define OMNITRACE_GIT_REVISION "@OMNITRACE_GIT_REVISION@"
// system info during compilation
#define OMNITRACE_LIBRARY_ARCH "@CMAKE_LIBRARY_ARCHITECTURE@"
#define OMNITRACE_SYSTEM_NAME "@CMAKE_SYSTEM_NAME@"
#define OMNITRACE_SYSTEM_PROCESSOR "@CMAKE_SYSTEM_PROCESSOR@"
#define OMNITRACE_SYSTEM_VERSION "@CMAKE_SYSTEM_VERSION@"
// compiler information
#define OMNITRACE_COMPILER_ID "@CMAKE_CXX_COMPILER_ID@"
#define OMNITRACE_COMPILER_VERSION "@CMAKE_CXX_COMPILER_VERSION@"
#define OMNITRACE_COMPILER_STRING OMNITRACE_COMPILER_ID " v" OMNITRACE_COMPILER_VERSION
#define OMNITRACE_DEFAULT_ROCM_PATH "@ROCmVersion_DIR@"
#define OMNITRACE_HIP_VERSION_STRING "@OMNITRACE_HIP_VERSION@"
#define OMNITRACE_HIP_VERSION_MAJOR @OMNITRACE_HIP_VERSION_MAJOR@
@@ -45,6 +56,27 @@
((10000 * OMNITRACE_HIP_VERSION_MAJOR) + (100 * OMNITRACE_HIP_VERSION_MINOR) + \
OMNITRACE_HIP_VERSION_PATCH)
#if OMNITRACE_HIP_VERSION_MAJOR > 0
# define OMNITRACE_HIP_VERSION_COMPAT_STRING \
"v@OMNITRACE_HIP_VERSION_MAJOR@.@OMNITRACE_HIP_VERSION_MINOR@.x"
#else
# define OMNITRACE_HIP_VERSION_COMPAT_STRING ""
#endif
// this should be passed to argparse::argument_parser::enable_version
// Example:
// parser.enable_version(<name>, OMNITRACE_ARGPARSE_VERSION_INFO);
#if !defined(OMNITRACE_ARGPARSE_VERSION_INFO)
# define OMNITRACE_ARGPARSE_VERSION_INFO \
"v" OMNITRACE_VERSION_STRING, OMNITRACE_GIT_DESCRIBE, OMNITRACE_GIT_REVISION, \
{ \
{ "", OMNITRACE_LIBRARY_ARCH }, { "compiler", OMNITRACE_COMPILER_STRING }, \
{ \
"rocm", OMNITRACE_HIP_VERSION_COMPAT_STRING \
} \
}
#endif
// clang-format off
#if !defined(OMNITRACE_MAX_THREADS)
# define OMNITRACE_MAX_THREADS @OMNITRACE_MAX_THREADS@
+33 -26
Zobrazit soubor
@@ -255,6 +255,14 @@ configure_settings(bool _init)
tim::manager::add_metadata("OMNITRACE_GIT_DESCRIBE", OMNITRACE_GIT_DESCRIBE);
tim::manager::add_metadata("OMNITRACE_GIT_REVISION", OMNITRACE_GIT_REVISION);
tim::manager::add_metadata("OMNITRACE_LIBRARY_ARCH", OMNITRACE_LIBRARY_ARCH);
tim::manager::add_metadata("OMNITRACE_SYSTEM_NAME", OMNITRACE_SYSTEM_NAME);
tim::manager::add_metadata("OMNITRACE_SYSTEM_PROCESSOR", OMNITRACE_SYSTEM_PROCESSOR);
tim::manager::add_metadata("OMNITRACE_SYSTEM_VERSION", OMNITRACE_SYSTEM_VERSION);
tim::manager::add_metadata("OMNITRACE_COMPILER_ID", OMNITRACE_COMPILER_ID);
tim::manager::add_metadata("OMNITRACE_COMPILER_VERSION", OMNITRACE_COMPILER_VERSION);
#if OMNITRACE_HIP_VERSION > 0
tim::manager::add_metadata("OMNITRACE_HIP_VERSION", OMNITRACE_HIP_VERSION_STRING);
tim::manager::add_metadata("OMNITRACE_HIP_VERSION_MAJOR",
@@ -1511,38 +1519,37 @@ print_banner(std::ostream& _os)
\______/ |__| |__| |__| \__| |__| |__| | _| `._____/__/ \__\ \______||_______|
)banner";
auto _tag = std::string_view{ OMNITRACE_GIT_DESCRIBE };
auto _rev = std::string_view{ OMNITRACE_GIT_REVISION };
#if OMNITRACE_HIP_VERSION_MAJOR > 0
auto _hip = JOIN('.', OMNITRACE_HIP_VERSION_MAJOR, OMNITRACE_HIP_VERSION_MINOR, "x");
#else
auto _hip = std::string_view{};
#endif
std::stringstream _version_info{};
_version_info << "omnitrace v" << OMNITRACE_VERSION_STRING;
if(!_tag.empty() || !_rev.empty() || !_hip.empty())
{
_version_info << " (";
if(!_tag.empty())
{
_version_info << "tag: " << OMNITRACE_GIT_DESCRIBE;
if(!_rev.empty()) _version_info << ", ";
}
if(!_rev.empty())
{
_version_info << "rev: " << OMNITRACE_GIT_REVISION;
if(!_hip.empty()) _version_info << ", ";
}
namespace join = ::timemory::join;
if(!_hip.empty())
{
_version_info << "rocm: " << _hip;
}
}
// assemble the list of properties
auto _generate_properties =
[](std::initializer_list<std::pair<std::string, std::string>>&& _data) {
auto _property_info = std::vector<std::string>{};
_property_info.reserve(_data.size());
for(const auto& itr : _data)
{
if(!itr.second.empty())
_property_info.emplace_back(
itr.first.empty() ? itr.second
: join::join(": ", itr.first, itr.second));
}
return _property_info;
};
if(!_version_info.str().empty()) _version_info << ")";
auto _properties =
_generate_properties({ { "rev", OMNITRACE_GIT_REVISION },
{ "tag", OMNITRACE_GIT_DESCRIBE },
{ "", OMNITRACE_LIBRARY_ARCH },
{ "compiler", OMNITRACE_COMPILER_STRING },
{ "rocm", OMNITRACE_HIP_VERSION_COMPAT_STRING } });
// <NAME> <VERSION> (<PROPERTIES>)
if(!_properties.empty())
_version_info << join::join(join::array_config{ ", ", " (", ")" }, _properties);
tim::log::stream(_os, tim::log::color::info()) << _banner << _version_info.str();
_os << std::endl;
+2 -2
Zobrazit soubor
@@ -124,12 +124,12 @@ start()
}
else
{
OMNITRACE_VERBOSE(0, "Resuming perfetto...\n");
OMNITRACE_VERBOSE(2, "Resuming perfetto...\n");
_tmp_file->fopen("a+");
}
}
OMNITRACE_VERBOSE(0, "Setup perfetto...\n");
OMNITRACE_VERBOSE(2, "Setup perfetto...\n");
int _fd = (_tmp_file) ? _tmp_file->fd : -1;
auto& cfg = get_config();
tracing_session->Setup(cfg, _fd);
@@ -45,17 +45,13 @@ namespace causal
{
namespace component
{
struct sample_rate
: tim::component::empty_base
, tim::concepts::component
struct sample_rate : comp::empty_base
{
using value_type = void;
static void sample(int = -1);
};
struct backtrace
: tim::component::empty_base
, tim::concepts::component
struct backtrace : comp::empty_base
{
using value_type = void;
using sample_data_set_t = std::set<sample_data>;
@@ -38,9 +38,7 @@ namespace omnitrace
{
namespace causal
{
struct delay
: tim::component::empty_base
, tim::concepts::component
struct delay : comp::empty_base
{
using value_type = void;
@@ -45,9 +45,7 @@ namespace omnitrace
{
namespace component
{
struct backtrace
: tim::component::empty_base
, tim::concepts::component
struct backtrace : comp::empty_base
{
static constexpr size_t stack_depth = OMNITRACE_MAX_UNWIND_DEPTH;
@@ -51,9 +51,7 @@ using type_list = ::tim::type_list<Tp...>;
namespace component
{
struct backtrace_metrics
: tim::component::empty_base
, concepts::component
struct backtrace_metrics : comp::empty_base
{
static constexpr size_t num_hw_counters = TIMEMORY_PAPI_ARRAY_SIZE;
@@ -38,9 +38,7 @@ namespace omnitrace
{
namespace component
{
struct backtrace_timestamp
: tim::component::empty_base
, tim::concepts::component
struct backtrace_timestamp : comp::empty_base
{
using value_type = void;
@@ -34,12 +34,11 @@ namespace omnitrace
namespace component
{
struct cpu_freq
: tim::concepts::component
, tim::component::empty_base
: comp::empty_base
, tim::component::base_format<cpu_freq>
, tim::component::base_data<std::vector<uint64_t>, 1>
{
using base_type = tim::component::empty_base;
using base_type = comp::empty_base;
using this_type = cpu_freq;
using value_type = std::vector<uint64_t>;
using storage_type = tim::storage<cpu_freq, value_type>;
@@ -127,6 +127,8 @@ extern "C"
void OnLoadToolProp(rocprofiler_settings_t* settings)
{
using ::rocprofiler::util::HsaRsrcFactory;
OMNITRACE_BASIC_VERBOSE_F(2 || rocm::on_load_trace, "Loading...\n");
rocm::lock_t _lk{ rocm::rocm_mutex, std::defer_lock };
@@ -332,6 +334,8 @@ extern "C"
}
else
{
using ::rocprofiler::util::HsaRsrcFactory;
HsaRsrcFactory::Instance().PrintGpuAgents("ROCm");
}
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include <atomic>
#include <cassert>
#include <cxxabi.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <fstream>
@@ -43,8 +44,23 @@ THE SOFTWARE.
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <vector>
namespace rocprofiler
{
namespace util
{
// Demangle a C++ symbol name.
//
// Returns a NUL-terminated string holding the demangled form of `symname` when
// abi::__cxa_demangle succeeds, or a copy of `symname` itself otherwise.
//
// The returned buffer is allocated with `new char[]` and is owned by the caller,
// who must release it with `delete[]`. The kernel-symbol map in this file disposes
// of replaced entries with `delete[]`, so the result must not come straight from
// malloc()/strdup() (which is what __cxa_demangle hands back).
static const char*
cpp_demangle(const char* symname)
{
    int   status    = 0;
    char* demangled = abi::__cxa_demangle(symname, nullptr, nullptr, &status);

    // fall back to the raw symbol when it is not a valid mangled name
    const char*  text = (demangled != nullptr) ? demangled : symname;
    const size_t len  = strlen(text);

    char* result = new char[len + 1];
    memcpy(result, text, len + 1);  // includes the terminating NUL

    // __cxa_demangle allocates with malloc(); free(nullptr) is a no-op
    free(demangled);
    return result;
}
// Callback function to get available in the system agents
hsa_status_t
HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data)
@@ -168,10 +184,20 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa)
CHECK_STATUS("HSA timer allocation failed",
(timer_ == nullptr) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS);
// Time correlation
const uint32_t corr_iters = 1000;
for(unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1)
{
CorrelateTime((HsaTimer::time_id_t) time_id, corr_iters);
}
// System timeout
timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX)
? timeout_ns_
: timer_->ns_to_sysclock(timeout_ns_);
// To dump code objects
to_dump_code_obj_ = getenv("ROCP_DUMP_CODEOBJ");
}
// Destructor of the class
@@ -214,12 +240,12 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table)
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
hsa_api_.hsa_queue_load_write_index_relaxed =
table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed =
table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed =
table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_queue_load_write_index_relaxed =
table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_add_write_index_scacq_screl =
table->core_->hsa_queue_add_write_index_scacq_screl_fn;
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
@@ -236,7 +262,8 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table)
table->core_->hsa_executable_create_alt_fn;
hsa_api_.hsa_executable_load_agent_code_object =
table->core_->hsa_executable_load_agent_code_object_fn;
hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn;
hsa_api_.hsa_executable_get_symbol =
table->core_->hsa_executable_get_symbol_fn;
hsa_api_.hsa_executable_symbol_get_info =
@@ -277,12 +304,12 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table)
hsa_api_.hsa_queue_create = hsa_queue_create;
hsa_api_.hsa_queue_destroy = hsa_queue_destroy;
hsa_api_.hsa_queue_load_write_index_relaxed =
hsa_queue_load_write_index_relaxed;
hsa_api_.hsa_queue_store_write_index_relaxed =
hsa_queue_store_write_index_relaxed;
hsa_api_.hsa_queue_load_read_index_relaxed =
hsa_queue_load_read_index_relaxed;
hsa_api_.hsa_queue_load_write_index_relaxed =
hsa_queue_load_write_index_relaxed;
hsa_api_.hsa_queue_add_write_index_scacq_screl =
hsa_queue_add_write_index_scacq_screl;
hsa_api_.hsa_signal_create = hsa_signal_create;
hsa_api_.hsa_signal_destroy = hsa_signal_destroy;
@@ -297,6 +324,7 @@ HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table)
hsa_api_.hsa_executable_load_agent_code_object =
hsa_executable_load_agent_code_object;
hsa_api_.hsa_executable_freeze = hsa_executable_freeze;
hsa_api_.hsa_executable_destroy = hsa_executable_destroy;
hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol;
hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info;
hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols;
@@ -400,8 +428,10 @@ HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent)
agent_info->dev_id = agent;
agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
strncpy(agent_info->gfxip, agent_info->name, 4);
agent_info->gfxip[4] = '\0';
const int gfxip_label_len =
std::min(strlen(agent_info->name) - 2, sizeof(agent_info->gfxip) - 1);
memcpy(agent_info->gfxip, agent_info->name, gfxip_label_len);
agent_info->gfxip[gfxip_label_len] = '\0';
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
&agent_info->max_wave_size);
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
@@ -437,7 +467,12 @@ HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent)
agent_info->vgpr_block_size = 4;
// Set GPU index
agent_info->dev_index = gpu_list_.size();
uint32_t driver_node_id;
status = hsa_api_.hsa_agent_get_info(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID),
&driver_node_id);
CHECK_STATUS("hsa_agent_get_info(gpu hsa_driver_node_id)", status);
agent_info->dev_index = driver_node_id;
gpu_list_.push_back(agent_info);
gpu_agents_.push_back(agent);
}
@@ -646,22 +681,27 @@ HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size)
}
// Wait signal
void
HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const
hsa_signal_value_t
HsaRsrcFactory::SignalWait(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const
{
const hsa_signal_value_t exp_value = signal_value - 1;
hsa_signal_value_t ret_value = signal_value;
while(true)
{
const hsa_signal_value_t signal_value = hsa_api_.hsa_signal_wait_scacquire(
signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED);
if(signal_value == 0)
ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT,
signal_value, timeout_,
HSA_WAIT_STATE_BLOCKED);
if(ret_value == exp_value) break;
if(ret_value != signal_value)
{
break;
}
else
{
CHECK_STATUS("hsa_signal_wait_scacquire()", HSA_STATUS_ERROR);
std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value("
<< signal_value << "), ret_value(" << ret_value << ")" << std::endl
<< std::flush;
abort();
}
}
return ret_value;
}
// Wait signal with signal value restore
@@ -669,7 +709,7 @@ void
HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const
{
SignalWait(signal);
SignalWait(signal, signal_value);
hsa_api_.hsa_signal_store_relaxed(const_cast<hsa_signal_t&>(signal), signal_value);
}
@@ -686,7 +726,7 @@ HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, siz
status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size,
0, nullptr, s);
CHECK_STATUS("hsa_amd_memory_async_copy()", status);
SignalWait(s);
SignalWait(s, 1);
status = hsa_api_.hsa_signal_destroy(s);
CHECK_STATUS("hsa_signal_destroy()", status);
}
@@ -766,6 +806,8 @@ HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_pa
agent_info->dev_id, 0, &kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
close(file_handle);
// Update output parameter
*code_desc = kernelSymbol;
return true;
@@ -808,20 +850,36 @@ HsaRsrcFactory::PrintGpuAgents(const std::string&)
return true;
}
// Translate a queue index into the address of the corresponding command slot.
// The index is wrapped modulo queue->size and scaled by CMD_SLOT_SIZE_B bytes
// from queue->base_address.
void*
HsaRsrcFactory::GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx)
{
    const uint32_t  wrapped_idx = (uint32_t)(idx % queue->size);
    const uint32_t  bytes_per_slot = CMD_SLOT_SIZE_B;
    const uintptr_t ring_base = (uintptr_t)(queue->base_address);
    return reinterpret_cast<void*>(ring_base + (wrapped_idx * bytes_per_slot));
}
// Address of the slot at the queue's current read index (loaded with relaxed
// ordering through the HSA API table).
void*
HsaRsrcFactory::GetReadPointer(hsa_queue_t* queue)
{
    return GetSlotPointer(queue, hsa_api_.hsa_queue_load_read_index_relaxed(queue));
}
uint64_t
HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet)
{
const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
// advance command queue
const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue);
hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
const uint64_t write_idx = hsa_api_.hsa_queue_add_write_index_scacq_screl(queue, 1);
while((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size)
{
sched_yield();
}
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
const uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) +
(slot_idx * slot_size_b));
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
@@ -862,29 +920,32 @@ HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes
}
const char*
HsaRsrcFactory::GetKernelName(uint64_t addr)
HsaRsrcFactory::GetKernelNameRef(uint64_t addr)
{
std::lock_guard<mutex_t> lck(mutex_);
const auto it = symbols_map_->find(addr);
if(it == symbols_map_->end())
{
fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr);
fprintf(stderr,
"HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n",
addr);
abort();
}
return strdup(it->second);
return it->second;
}
void
HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table)
{
std::lock_guard<mutex_t> lck(mutex_);
executable_tracking_on_ = true;
table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
executable_tracking_on_ = true;
table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor;
}
hsa_status_t
HsaRsrcFactory::executable_symbols_cb(hsa_executable_t /*exec*/,
hsa_executable_symbol_t symbol, void* /*data*/)
hsa_executable_symbol_t symbol, void* data)
{
hsa_symbol_kind_t value = (hsa_symbol_kind_t) 0;
hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(
@@ -900,17 +961,26 @@ HsaRsrcFactory::executable_symbols_cb(hsa_executable_t /*exec*/,
status = hsa_api_.hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len);
CHECK_STATUS("Error in getting name len", status);
char* name = new char[len + 1];
status = hsa_api_.hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
char* symname = new char[len + 1];
status = hsa_api_.hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symname);
CHECK_STATUS("Error in getting kernel name", status);
name[len] = 0;
auto ret = symbols_map_->insert({ addr, name });
if(ret.second == false)
symname[len] = 0;
if(data == nullptr)
{
delete[] ret.first->second;
ret.first->second = name;
const char* name = cpp_demangle(symname);
auto ret = symbols_map_->insert({ addr, name });
if(ret.second == false)
{
delete[] ret.first->second;
ret.first->second = name;
}
}
else
{
symbols_map_->erase(addr);
}
delete[] symname;
}
return HSA_STATUS_SUCCESS;
}
@@ -925,7 +995,19 @@ HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable,
executable, executable_symbols_cb, nullptr);
CHECK_STATUS("Error in iterating executable symbols", status);
return hsa_api_.hsa_executable_freeze(executable, options);
;
}
// Interceptor for hsa_executable_destroy: while holding mutex_, walk the symbols
// of the executable with executable_symbols_cb (when a symbol map exists), then
// forward the call to the real hsa_executable_destroy.
hsa_status_t
HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable)
{
    std::lock_guard<mutex_t> lck(mutex_);

    if(symbols_map_ != nullptr)
    {
        // a non-null user-data pointer makes the callback erase map entries
        // instead of inserting them
        void* remove_flag = (void*) 1;

        hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(
            executable, executable_symbols_cb, remove_flag);
        CHECK_STATUS("Error in iterating executable symbols", status);
    }

    return hsa_api_.hsa_executable_destroy(executable);
}
std::atomic<HsaRsrcFactory*> HsaRsrcFactory::instance_{};
@@ -934,3 +1016,7 @@ HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP
hsa_pfn_t HsaRsrcFactory::hsa_api_{};
bool HsaRsrcFactory::executable_tracking_on_ = false;
HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = nullptr;
void* HsaRsrcFactory::to_dump_code_obj_ = nullptr;
} // namespace util
} // namespace rocprofiler
@@ -38,6 +38,7 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <mutex>
@@ -77,6 +78,10 @@
} \
} while(0)
namespace rocprofiler
{
namespace util
{
static const size_t MEM_PAGE_BYTES = 0x1000;
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
@@ -88,11 +93,12 @@ struct hsa_pfn_t
decltype(::hsa_agent_get_info)* hsa_agent_get_info;
decltype(::hsa_iterate_agents)* hsa_iterate_agents;
decltype(::hsa_queue_create)* hsa_queue_create;
decltype(::hsa_queue_destroy)* hsa_queue_destroy;
decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(::hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
decltype(::hsa_queue_create)* hsa_queue_create;
decltype(::hsa_queue_destroy)* hsa_queue_destroy;
decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(
::hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl;
decltype(::hsa_signal_create)* hsa_signal_create;
decltype(::hsa_signal_destroy)* hsa_signal_destroy;
@@ -107,6 +113,7 @@ struct hsa_pfn_t
decltype(
::hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
decltype(::hsa_executable_freeze)* hsa_executable_freeze;
decltype(::hsa_executable_destroy)* hsa_executable_destroy;
decltype(::hsa_executable_get_symbol)* hsa_executable_get_symbol;
decltype(::hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info;
decltype(::hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols;
@@ -180,10 +187,11 @@ struct AgentInfo
// Number of Shader Arrays Per Shader Engines in Gpu
uint32_t shader_arrays_per_se;
// SGPR/VGPR block sizes
uint32_t sgpr_block_dflt;
uint32_t sgpr_block_size;
uint32_t vgpr_block_size;
// SGPR/VGPR/LDS block sizes
uint32_t sgpr_block_dflt;
uint32_t sgpr_block_size;
uint32_t vgpr_block_size;
static const uint32_t lds_block_size = 128 * 4;
};
// HSA timer class
@@ -195,6 +203,16 @@ public:
static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
typedef long double freq_t;
// Identifiers of the clocks that can be correlated with the HSA timestamp.
// Values are consecutive starting at zero so they can index per-clock arrays;
// TIME_ID_NUMBER is the number of identifiers.
enum time_id_t
{
    TIME_ID_CLOCK_REALTIME = 0,      // 0
    TIME_ID_CLOCK_REALTIME_COARSE,   // 1
    TIME_ID_CLOCK_MONOTONIC,         // 2
    TIME_ID_CLOCK_MONOTONIC_COARSE,  // 3
    TIME_ID_CLOCK_MONOTONIC_RAW,     // 4
    TIME_ID_NUMBER                   // 5 (count)
};
HsaTimer(const hsa_pfn_t* hsa_api)
: hsa_api_(hsa_api)
{
@@ -215,6 +233,12 @@ public:
return timestamp_t((freq_t) time / sysclock_factor_);
}
// Convert a timespec (seconds + nanoseconds) into a single nanosecond count
static timestamp_t timespec_to_ns(const timespec& time)
{
    const timestamp_t seconds_as_ns = (timestamp_t) time.tv_sec * 1000000000;
    return seconds_as_ns + time.tv_nsec;
}
// Return timestamp in 'ns'
timestamp_t timestamp_ns() const
{
@@ -225,6 +249,57 @@ public:
return sysclock_to_ns(sysclock);
}
// Return the current reading of the given POSIX clock in 'ns'.
// The result of clock_gettime() is not checked; the timespec is value-initialized
// so that a failed call (e.g. an unsupported clock id) yields 0 rather than the
// conversion of an indeterminate value.
timestamp_t clocktime_ns(clockid_t clock_id) const
{
    timespec now{};
    clock_gettime(clock_id, &now);
    return timespec_to_ns(now);
}
// Return pair of correlated values of profiling timestamp and time with
// correlation error for a given time ID and number of iterations.
//
// The HSA system timestamp (HSA_SYSTEM_INFO_TIMESTAMP) and the selected POSIX
// clock are sampled back-to-back `iters` times. The averages of both series are
// written to `*timestamp_v` and `*time_v`, and the estimated error (half of the
// mean interval between consecutive timestamp samples) to `*error_v`.
// All three outputs are in nanoseconds.
//
// time_id     : which POSIX clock to correlate against
// iters       : number of sample pairs; values below 1 are treated as 1
// timestamp_v : [out] averaged HSA timestamp, shifted by the error estimate
// time_v      : [out] averaged clock time
// error_v     : [out] error estimate (0 when only one sample is taken)
void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v,
                        timestamp_t* time_v, timestamp_t* error_v)
{
    clockid_t clock_id = 0;
    switch(time_id)
    {
        case TIME_ID_CLOCK_REALTIME: clock_id = CLOCK_REALTIME; break;
        case TIME_ID_CLOCK_REALTIME_COARSE: clock_id = CLOCK_REALTIME_COARSE; break;
        case TIME_ID_CLOCK_MONOTONIC: clock_id = CLOCK_MONOTONIC; break;
        case TIME_ID_CLOCK_MONOTONIC_COARSE: clock_id = CLOCK_MONOTONIC_COARSE; break;
        case TIME_ID_CLOCK_MONOTONIC_RAW: clock_id = CLOCK_MONOTONIC_RAW; break;
        default: CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR);
    }

    // always take at least one sample so front()/back() below are well-defined
    const uint32_t samples = (iters > 0) ? iters : 1;
    const uint32_t steps   = samples - 1;

    std::vector<timestamp_t> ts_vec(samples);
    std::vector<timespec>    tm_vec(samples);

    for(uint32_t i = 0; i < samples; ++i)
    {
        hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]);
        clock_gettime(clock_id, &tm_vec[i]);
    }

    // convert every raw sysclock sample to 'ns' before combining it with other
    // 'ns' quantities; mixing raw ticks with converted values is only correct
    // when the sysclock factor happens to be 1
    const timestamp_t ts_base = sysclock_to_ns(ts_vec.front());
    const timestamp_t tm_base = timespec_to_ns(tm_vec.front());

    // a single sample has no interval to estimate the error from (and would
    // otherwise divide by zero)
    const timestamp_t error =
        (steps > 0) ? (sysclock_to_ns(ts_vec.back()) - ts_base) /
                          (2 * static_cast<timestamp_t>(steps))
                    : 0;

    // accumulate offsets from the first sample rather than absolute values
    timestamp_t ts_accum = 0;
    timestamp_t tm_accum = 0;
    for(uint32_t i = 0; i < samples; ++i)
    {
        ts_accum += (sysclock_to_ns(ts_vec[i]) - ts_base);
        tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base);
    }

    *timestamp_v = (ts_accum / samples) + ts_base + error;
    *time_v      = (tm_accum / samples) + tm_base;
    *error_v     = error;
}
private:
// Timestamp frequency factor
freq_t sysclock_factor_;
@@ -332,7 +407,8 @@ public:
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);
// Wait signal
void SignalWait(const hsa_signal_t& signal) const;
hsa_signal_value_t SignalWait(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const;
// Wait signal with signal value restore
void SignalWaitRestore(const hsa_signal_t& signal,
@@ -359,14 +435,16 @@ public:
// Print the various fields of Hsa Gpu Agents
bool PrintGpuAgents(const std::string& header);
// Submit AQL packet to given queue
// Utils for submitting AQL packet to a given queue
static void* GetSlotPointer(hsa_queue_t* queue, const uint64_t& idx);
static void* GetReadPointer(hsa_queue_t* queue);
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
// Enable executables loading tracking
static bool IsExecutableTracking() { return executable_tracking_on_; }
static void EnableExecutableTracking(HsaApiTable* table);
static const char* GetKernelName(uint64_t addr);
static const char* GetKernelNameRef(uint64_t addr);
// Initialize HSA API table
void static InitHsaApiTable(HsaApiTable* table);
@@ -400,6 +478,29 @@ public:
Instance().timeout_ = Instance().timer_->ns_to_sysclock(time);
}
// Ask the timer for a correlated (HSA timestamp, clock time) pair for the clock
// 'time_id', sampled over 'iters' iterations, and cache the results:
//   time_shift_[time_id] = clock time - HSA timestamp
//   time_error_[time_id] = error value reported by the timer
// NOTE(review): the visible tail of correlated_pair_ns divides by 2 * (iters - 1),
// so callers presumably need iters >= 2 - confirm.
void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters)
{
    timestamp_t hsa_stamp_ns = 0;
    timestamp_t clock_ns     = 0;
    timestamp_t pair_error   = 0;
    timer_->correlated_pair_ns(time_id, iters, &hsa_stamp_ns, &clock_ns, &pair_error);
    time_error_[time_id] = pair_error;
    // Offset that GetTimeVal adds to a timestamp for this time id
    time_shift_[time_id] = clock_ns - hsa_stamp_ns;
}
// Convert 'time_stamp' to the clock identified by 'time_id' by adding the shift
// cached in time_shift_ for that id.
//   time_id    - index into the per-clock shift table
//   time_stamp - timestamp to convert
//   time_value - output: time_stamp + time_shift_[time_id]; untouched on error
// Returns HSA_STATUS_ERROR when time_id is not below HsaTimer::TIME_ID_NUMBER,
// HSA_STATUS_SUCCESS otherwise.
hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value)
{
    const bool id_in_range = (time_id < HsaTimer::TIME_ID_NUMBER);
    if(id_in_range)
    {
        *time_value = time_stamp + time_shift_[time_id];
    }
    return id_in_range ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
// Report the error value cached for the clock identified by 'time_id'.
//   time_id - index into the per-clock error table
//   err     - output: time_error_[time_id] as stored by CorrelateTime; untouched on
//             error
// Returns HSA_STATUS_ERROR when time_id is not below HsaTimer::TIME_ID_NUMBER (same
// contract as GetTimeVal), HSA_STATUS_SUCCESS otherwise.
hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err)
{
    // Reject unknown ids instead of reading past the end of time_error_
    if(time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR;
    *err = time_error_[time_id];
    return HSA_STATUS_SUCCESS;
}
private:
// System agents iterating callback
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
@@ -444,8 +545,10 @@ private:
typedef std::map<uint64_t, const char*> symbols_map_t;
static symbols_map_t* symbols_map_;
static bool executable_tracking_on_;
static void* to_dump_code_obj_;
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable,
const char* options);
static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable);
static hsa_status_t executable_symbols_cb(hsa_executable_t exec,
hsa_executable_symbol_t symbol, void* data);
@@ -466,7 +569,14 @@ private:
// HSA timer; CorrelateTime queries it for correlated timestamp/time pairs
HsaTimer* timer_;
// Time shift array to support time conversion: one entry per HsaTimer time id,
// set by CorrelateTime to (time - timestamp) and added to timestamps in GetTimeVal
timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER];
// Error value reported by the timer for each time id; written by CorrelateTime
// and returned by GetTimeErr
timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER];
// CPU/kern-arg memory pools
hsa_amd_memory_pool_t* cpu_pool_;
hsa_amd_memory_pool_t* kern_arg_pool_;
};
} // namespace util
} // namespace rocprofiler
+33 -21
Zobrazit soubor
@@ -56,6 +56,9 @@ namespace rocprofiler
{
namespace
{
using ::rocprofiler::util::AgentInfo;
using ::rocprofiler::util::HsaRsrcFactory;
auto&
get_event_names()
{
@@ -271,30 +274,25 @@ metrics_input(unsigned _device, rocprofiler_feature_t** ret)
return feature_count;
}
struct info_data
{
const AgentInfo* agent = nullptr;
std::vector<component::rocm_info_entry>* data = nullptr;
};
using info_data = std::vector<component::rocm_info_entry>;
hsa_status_t
info_data_callback(const rocprofiler_info_data_t info, void* arg)
{
using qualifier_t = tim::hardware_counters::qualifier;
using qualifier_vec_t = std::vector<qualifier_t>;
auto* _arg = static_cast<info_data*>(arg);
const auto* _agent = _arg->agent;
auto* _data = _arg->data;
auto* _data = static_cast<info_data*>(arg);
auto _dev_index = info.agent_index;
switch(info.kind)
{
case ROCPROFILER_INFO_KIND_METRIC:
{
auto _device_qualifier_sym = JOIN("", ":device=", _agent->dev_index);
auto _device_qualifier = tim::hardware_counters::qualifier{
true, static_cast<int>(_agent->dev_index), _device_qualifier_sym,
JOIN(" ", "Device", _agent->dev_index)
};
auto _device_qualifier_sym = JOIN("", ":device=", _dev_index);
auto _device_qualifier =
tim::hardware_counters::qualifier{ true, static_cast<int>(_dev_index),
_device_qualifier_sym,
JOIN(" ", "Device", _dev_index) };
auto _long_desc = std::string{ info.metric.description };
auto _units = std::string{};
auto _pysym = std::string{};
@@ -313,7 +311,7 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg)
{
auto _sym = JOIN("", info.metric.name, _device_qualifier_sym);
auto _short_desc =
JOIN("", info.metric.name, " on device ", _agent->dev_index);
JOIN("", info.metric.name, " on device ", _dev_index);
_data->emplace_back(component::rocm_info_entry(
true, tim::hardware_counters::api::rocm, _data->size(), 0, _sym,
_pysym, _short_desc, _long_desc, _units,
@@ -331,7 +329,7 @@ info_data_callback(const rocprofiler_info_data_t info, void* arg)
auto _sym = JOIN("", info.metric.name, _instance_qualifier_sym,
_device_qualifier_sym);
auto _short_desc = JOIN("", info.metric.name, " instance ", i,
" on device ", _agent->dev_index);
" on device ", _dev_index);
_data->emplace_back(component::rocm_info_entry(
true, tim::hardware_counters::api::rocm, _data->size(), 0,
_sym, _pysym, _short_desc, _long_desc, _units,
@@ -369,12 +367,26 @@ rocm_metrics()
const AgentInfo** _agent_p = &_agent;
HsaRsrcFactory::Instance().GetGpuAgentInfo(i, _agent_p);
auto _v = info_data{ _agent, &_data };
if(!rocm_check_status(
rocprofiler_iterate_info(&_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC,
info_data_callback, reinterpret_cast<void*>(&_v)),
{ HSA_STATUS_ERROR_NOT_INITIALIZED }))
return _data;
if(!rocm_check_status(rocprofiler_iterate_info(
&_agent->dev_id, ROCPROFILER_INFO_KIND_METRIC,
info_data_callback, reinterpret_cast<void*>(&_data)),
{ HSA_STATUS_ERROR_NOT_INITIALIZED }))
{
OMNITRACE_WARNING_F(-1, "rocprofiler_iterate_info failed for gpu agent %u\n",
i);
}
}
if(gpu_count > 0 && _data.empty())
{
if(!rocm_check_status(rocprofiler_iterate_info(
nullptr, ROCPROFILER_INFO_KIND_METRIC,
info_data_callback, reinterpret_cast<void*>(&_data)),
{ HSA_STATUS_ERROR_NOT_INITIALIZED }))
{
OMNITRACE_WARNING_F(-1, "rocprofiler_iterate_info failed for %i gpu agents\n",
gpu_count);
}
}
auto _settings = tim::settings::shared_instance();