- include/rocprofiler/agent.h
  - move rocprofiler_dim3_t
- include/rocprofiler/buffer_tracing.h
  - size fields
  - update kernel dispatch record
- include/rocprofiler/callback_tracing.h
  - remove rocprofiler_callback_tracing_code_object_unload_data_t
  - remove rocprofiler_callback_tracing_code_object_register_host_kernel_symbol_data_t
- include/rocprofiler/fwd.h
  - added ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT
  - remove ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_UNLOAD
  - remove ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_UNREGISTER
  - add rocprofiler_kernel_id_t typedef
  - add rocprofiler_dim3_t (moved from agent.h)
- lib/common/synchronized.hpp
  - rlock/wlock return decltype(auto)
  - separate prototype from definition
- lib/common/utility.{hpp,cpp}
  - timestamp functions replicating HSA implementation(s)
  - init_public_api_struct for setting size field and ensuring certain type traits
  - simplified static_cleanup_wrapper
  - separate prototype from definition in active_capacity_gate
- lib/rocprofiler/agent.cpp
  - tweak get_rocprofiler_agent impl
- lib/rocprofiler/buffer.cpp
  - fix buffer message log level
- lib/rocprofiler/context.cpp
  - use new paradigm for getting active contexts
- lib/rocprofiler/internal_threading.hpp
  - update to simplified static_cleanup_wrapper implementation
- lib/rocprofiler/registration.cpp
  - fix deactivating contexts
- lib/rocprofiler/rocprofiler.cpp
  - status string for context conflict
- lib/rocprofiler/context/context.*
  - correlation_id struct
  - new get_active_contexts paradigm
- lib/rocprofiler/counters/core.*
  - rocprofiler_packet union
  - tweak start/stop context to accept pointer instead of handle
- lib/rocprofiler/counters/dimensions.cpp
  - update to new get_rocp_agent() return type
- lib/rocprofiler/hsa/hsa.*
  - update to new get_active_contexts paradigm
  - update to new correlation id implementation
  - guard against hsa.def.cpp direct compilation
- lib/rocprofiler/hsa/queue_controller.*
  - update to change in get_rocp_agent return type
  - consistent aliases
  - lookup function for getting queue pointer from hsa queue id
- lib/rocprofiler/hsa/queue.*
  - rocprofiler_packet
  - extend queue_info_session_t
- lib/rocprofiler/tests/registration.cpp
  - improve diagnostic on perf check for rocprofiler_lib.callback_registration_lambda_with_result
Этот коммит содержится в:
Jonathan R. Madsen
2023-11-06 21:59:31 -06:00
коммит произвёл GitHub
родитель 63775f241a
Коммит 55f2dabbb3
26 изменённых файлов: 828 добавлений и 325 удалений
-10
Просмотреть файл
@@ -86,16 +86,6 @@ typedef struct rocprofiler_agent_mem_bank_t
uint64_t size_in_bytes; ///< physical memory size of the memory range in bytes
} rocprofiler_agent_mem_bank_t;
/**
* @brief Multi-dimensional struct of data
*/
typedef struct rocprofiler_dim3_t
{
uint32_t x;
uint32_t y;
uint32_t z;
} rocprofiler_dim3_t;
/**
* @brief Agent.
*/
+16 -1
Просмотреть файл
@@ -22,6 +22,7 @@
#pragma once
#include <rocprofiler/agent.h>
#include <rocprofiler/defines.h>
#include <rocprofiler/fwd.h>
@@ -39,6 +40,7 @@ ROCPROFILER_EXTERN_C_INIT
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
rocprofiler_tracing_operation_t operation; // rocprofiler/hsa.h
@@ -52,6 +54,7 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
rocprofiler_tracing_operation_t operation; // rocprofiler/hip.h
@@ -65,6 +68,7 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
rocprofiler_tracing_operation_t operation; // rocprofiler/marker.h
@@ -79,6 +83,7 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
/**
@@ -96,12 +101,18 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
rocprofiler_timestamp_t start_timestamp;
rocprofiler_timestamp_t end_timestamp;
rocprofiler_agent_id_t agent_id;
rocprofiler_queue_id_t queue_id;
const char* kernel_name;
rocprofiler_kernel_id_t kernel_id;
uint32_t private_segment_size;
uint32_t group_segment_size;
rocprofiler_dim3_t workgroup_size;
rocprofiler_dim3_t grid_size;
} rocprofiler_buffer_tracing_kernel_dispatch_record_t;
/**
@@ -109,6 +120,7 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
rocprofiler_timestamp_t start_timestamp;
@@ -122,6 +134,7 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
rocprofiler_timestamp_t start_timestamp;
@@ -135,6 +148,7 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
rocprofiler_timestamp_t start_timestamp;
@@ -198,6 +212,7 @@ typedef struct
*/
typedef struct
{
uint64_t size;
rocprofiler_service_buffer_tracing_kind_t kind;
rocprofiler_correlation_id_t correlation_id;
} rocprofiler_buffer_tracing_correlation_record_t;
+2 -25
Просмотреть файл
@@ -49,6 +49,7 @@ typedef enum
ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE = HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE,
ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY =
HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY,
ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_LAST,
} rocprofiler_code_object_storage_type_t;
/**
@@ -124,16 +125,6 @@ typedef struct
};
} rocprofiler_callback_tracing_code_object_load_data_t;
/**
* @brief ROCProfiler Code Object UnLoad Tracer Callback Record.
*
*/
typedef struct
{
uint64_t size; ///< size of this struct
uint64_t code_object_id; ///< unique code object identifier
} rocprofiler_callback_tracing_code_object_unload_data_t;
/**
* @brief ROCProfiler Code Object Kernel Symbol Tracer Callback Record.
*
@@ -154,21 +145,7 @@ typedef struct
///< (per work-group), in bytes
uint32_t private_segment_size; ///< Size of static private, spill, and arg segment memory
///< required by this kernel (per work-item), in bytes.
} rocprofiler_callback_tracing_code_object_kernel_symbol_data_t;
/**
* @brief ROCProfiler Code Object Register Host Kernel Symbol Tracer Callback
* Record.
*
*/
typedef struct
{
uint64_t size; ///< size of this struct
rocprofiler_address_t host_address; // host address
// Should this be nullptr if it is unregister?
const char* kernel_name; // kernel name string (NULL terminated)
rocprofiler_address_t kernel_descriptor; // kernel descriptor
} rocprofiler_callback_tracing_code_object_register_host_kernel_symbol_data_t;
} rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
/**
* @brief API Tracing callback function. This function is invoked twice per API function: once
+20 -6
Просмотреть файл
@@ -58,9 +58,10 @@ typedef enum // NOLINT(performance-enum-size)
ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, ///< Counter identifier does not exist
ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR, ///> Generalized context error
ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID, ///< Context configuration is not valid
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, ///< Context was not started (maybe already
///< started or atomic swap into active array
///< failed)
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, ///< Context was not started (e.g., atomic swap
///< into active array failed)
ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT, ///< Context operation failed due to a conflict with
///< another context
ROCPROFILER_STATUS_ERROR_BUFFER_BUSY, ///< buffer operation failed because it currently busy
///< handling another request (e.g. flushing)
ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED, ///< service has already been configured
@@ -154,12 +155,9 @@ typedef enum // NOLINT(performance-enum-size)
{
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_NONE = 0,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_UNLOAD,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_UNREGISTER,
// next two are part of hipRegisterFunction API.
// ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_HOST_KERNEL_SYMBOL_REGISTER,
// ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_HOST_KERNEL_SYMBOL_UNREGISTER,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LAST,
} rocprofiler_callback_tracing_code_object_operation_t;
@@ -251,6 +249,12 @@ typedef uint64_t rocprofiler_thread_id_t;
*/
typedef uint32_t rocprofiler_tracing_operation_t;
/**
* @brief Kernel identifier type
*
*/
typedef uint64_t rocprofiler_kernel_id_t;
// forward declaration of struct
typedef struct rocprofiler_pc_sampling_configuration_s rocprofiler_pc_sampling_configuration_t;
@@ -348,6 +352,16 @@ typedef struct
uint64_t handle;
} rocprofiler_profile_config_id_t;
/**
* @brief Multi-dimensional struct of data used to describe GPU workgroup and grid sizes
*/
typedef struct rocprofiler_dim3_t
{
uint32_t x;
uint32_t y;
uint32_t z;
} rocprofiler_dim3_t;
/**
* @brief Tracing record
*
+74 -44
Просмотреть файл
@@ -75,24 +75,10 @@ public:
Synchronized& operator=(const Synchronized&) = delete;
template <typename FuncT, typename... Args>
auto rlock(FuncT&& lambda, Args&&... args) const
{
static_assert(std::is_invocable<FuncT, const value_type&, Args...>::value,
"function must accept const reference to locked type");
auto lock = std::shared_lock{m_mutex};
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
}
decltype(auto) rlock(FuncT&& lambda, Args&&... args) const;
template <typename FuncT, typename... Args>
auto wlock(FuncT&& lambda, Args&&... args)
{
static_assert(std::is_invocable<FuncT, value_type&, Args...>::value,
"function must accept reference to locked type");
auto lock = std::unique_lock{m_mutex};
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
}
decltype(auto) wlock(FuncT&& lambda, Args&&... args);
// This overload to wlock allows a synchronized map whose keys map to synchronized data to
// use a read lock on the key data and then a write lock on the mapped data.
@@ -100,42 +86,86 @@ public:
typename... Args,
bool EnableForMappedType = IsMappedTypeV,
std::enable_if_t<EnableForMappedType, int> = 0>
auto wlock(FuncT&& lambda, Args&&... args) const
{
return const_cast<this_type*>(this)->wlock(std::forward<FuncT>(lambda),
std::forward<Args>(args)...);
}
decltype(auto) wlock(FuncT&& lambda, Args&&... args) const;
// Upgradable lock. If read returns false, write will be called with a unique_lock.
// Essentially a helper function that does .rlock() followed by .wlock().
template <typename ReadFuncT, typename WriteFuncT, typename... Args>
bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args)
{
static_assert(std::is_invocable<ReadFuncT, const value_type&, Args...>::value,
"read function must accept const reference to locked type");
static_assert(std::is_invocable<WriteFuncT, value_type&, Args...>::value,
"write function must accept reference to locked type");
using read_return_type = std::invoke_result_t<ReadFuncT, const value_type&, Args...>;
using write_return_type = std::invoke_result_t<WriteFuncT, value_type&, Args...>;
static_assert(std::is_same<read_return_type, write_return_type>::value,
"read and write functions must return same type");
static_assert(std::is_same<read_return_type, bool>::value,
"read/write functions must return bool");
{
auto lock = std::shared_lock{m_mutex};
if(read(m_data, std::forward<Args>(args)...)) return true;
}
auto lock = std::unique_lock{m_mutex};
return write(m_data, std::forward<Args>(args)...);
}
bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args);
private:
mutable std::shared_mutex m_mutex = {};
value_type m_data = {};
};
//
// member definitions
//
template <typename LockedType, bool IsMappedTypeV>
template <typename FuncT, typename... Args>
decltype(auto)
Synchronized<LockedType, IsMappedTypeV>::rlock(FuncT&& lambda, Args&&... args) const
{
static_assert(std::is_invocable<FuncT, const value_type&, Args...>::value,
"function must accept const reference to locked type");
auto lock = std::shared_lock{m_mutex};
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
}
template <typename LockedType, bool IsMappedTypeV>
template <typename FuncT, typename... Args>
decltype(auto)
Synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args)
{
static_assert(std::is_invocable<FuncT, value_type&, Args...>::value,
"function must accept reference to locked type");
auto lock = std::unique_lock{m_mutex};
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
}
// This overload to wlock allows a synchronized map whose keys map to synchronized data to
// use a read lock on the key data and then a write lock on the mapped data.
template <typename LockedType, bool IsMappedTypeV>
template <typename FuncT,
typename... Args,
bool EnableForMappedType,
std::enable_if_t<EnableForMappedType, int>>
decltype(auto)
Synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args) const
{
return const_cast<this_type*>(this)->wlock(std::forward<FuncT>(lambda),
std::forward<Args>(args)...);
}
// Upgradable lock. If read returns false, write will be called with a unique_lock.
// Essentially a helper function that does .rlock() followed by .wlock().
template <typename LockedType, bool IsMappedTypeV>
template <typename ReadFuncT, typename WriteFuncT, typename... Args>
bool
Synchronized<LockedType, IsMappedTypeV>::ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args)
{
static_assert(std::is_invocable<ReadFuncT, const value_type&, Args...>::value,
"read function must accept const reference to locked type");
static_assert(std::is_invocable<WriteFuncT, value_type&, Args...>::value,
"write function must accept reference to locked type");
using read_return_type = std::invoke_result_t<ReadFuncT, const value_type&, Args...>;
using write_return_type = std::invoke_result_t<WriteFuncT, value_type&, Args...>;
static_assert(std::is_same<read_return_type, write_return_type>::value,
"read and write functions must return same type");
static_assert(std::is_same<read_return_type, bool>::value,
"read/write functions must return bool");
{
auto lock = std::shared_lock{m_mutex};
if(read(m_data, std::forward<Args>(args)...)) return true;
}
auto lock = std::unique_lock{m_mutex};
return write(m_data, std::forward<Args>(args)...);
}
} // namespace common
} // namespace rocprofiler
+84
Просмотреть файл
@@ -21,16 +21,100 @@
#include "lib/common/utility.hpp"
#include <glog/logging.h>
#include <unistd.h>
#include <cerrno>
#include <cstring>
#include <ctime>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include "lib/common/defines.hpp"
namespace rocprofiler
{
namespace common
{
namespace
{
std::string_view
get_clock_name(clockid_t _id)
{
#define CLOCK_NAME_CASE_STATEMENT(NAME) \
case NAME: return #NAME;
switch(_id)
{
CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME)
CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC)
CLOCK_NAME_CASE_STATEMENT(CLOCK_PROCESS_CPUTIME_ID)
CLOCK_NAME_CASE_STATEMENT(CLOCK_THREAD_CPUTIME_ID)
CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC_RAW)
CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME_COARSE)
CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC_COARSE)
CLOCK_NAME_CASE_STATEMENT(CLOCK_BOOTTIME)
CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME_ALARM)
CLOCK_NAME_CASE_STATEMENT(CLOCK_BOOTTIME_ALARM)
CLOCK_NAME_CASE_STATEMENT(CLOCK_TAI)
default: break;
}
return "CLOCK_UNKNOWN";
}
} // namespace
clockid_t
get_accurate_clock_id_impl()
{
auto clock = CLOCK_MONOTONIC;
utsname kernelInfo;
if(uname(&kernelInfo) == 0)
{
try
{
std::string ver = kernelInfo.release;
size_t idx;
int major = std::stoi(ver, &idx);
int minor = std::stoi(ver.substr(idx + 1));
if(major > 4 || ((major == 4) && (minor >= 4)))
{
clock = CLOCK_MONOTONIC_RAW;
}
} catch(...)
{
// Kernel version string doesn't conform to the standard pattern.
// Keep using the "safe" (non-RAW) clock.
}
}
return clock;
}
uint64_t
get_clock_freq_ns_impl(clockid_t _clk_id)
{
constexpr auto nanosec = std::nano::den;
struct timespec ts;
auto ret = clock_getres(_clk_id, &ts);
if(ROCPROFILER_UNLIKELY(ret != 0))
{
auto _err = errno;
LOG(FATAL) << "error getting clock resolution for " << get_clock_name(_clk_id) << ": "
<< strerror(_err);
}
else if(ROCPROFILER_UNLIKELY(ts.tv_sec != 0 ||
ts.tv_nsec >= std::numeric_limits<uint32_t>::max()))
{
LOG(FATAL) << "clock_getres(" << get_clock_name(_clk_id)
<< ") returned very low frequency (<1Hz)";
}
auto&& _period =
(static_cast<uint64_t>(ts.tv_sec) * nanosec) + static_cast<uint64_t>(ts.tv_nsec);
return nanosec / _period;
}
std::vector<std::string>
read_command_line(pid_t _pid)
{
+164 -47
Просмотреть файл
@@ -22,19 +22,37 @@
#pragma once
#include "lib/common/defines.hpp"
#include <glog/logging.h>
#include <sys/syscall.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <functional>
#include <mutex>
#include <ratio>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
namespace rocprofiler
{
namespace common
{
clockid_t
get_accurate_clock_id_impl();
uint64_t
get_clock_freq_ns_impl(clockid_t _clk_id);
inline uint64_t
get_tid()
{
@@ -43,11 +61,51 @@ get_tid()
return _v;
}
inline clockid_t
get_accurate_clock_id()
{
static auto clk_id = get_accurate_clock_id_impl();
return clk_id;
}
inline uint64_t
get_accurate_clock_freq_ns()
{
static auto clk_freq = get_clock_freq_ns_impl(get_accurate_clock_id());
return clk_freq;
}
inline uint64_t
get_ticks(clockid_t clk_id_v) noexcept
{
constexpr auto nanosec = std::nano::den;
auto&& ts = timespec{};
auto ret = clock_gettime(clk_id_v, &ts);
if(ROCPROFILER_UNLIKELY(ret != 0))
{
auto _err = errno;
LOG(FATAL) << "clock_gettime failed: " << strerror(_err);
}
return (static_cast<uint64_t>(ts.tv_sec) * nanosec) + static_cast<uint64_t>(ts.tv_nsec);
}
// this equates to HSA-runtime library implementation of os::ReadAccurateClock()
inline uint64_t
timestamp_ns()
{
// TODO(jrmadsen): this should be updated to the HSA method
return std::chrono::steady_clock::now().time_since_epoch().count();
return get_ticks(get_accurate_clock_id()) * get_accurate_clock_freq_ns();
}
// this equates to HSA-runtime library implementation of os::ReadSystemClock()
inline uint64_t
system_timestamp_ns()
{
constexpr auto boottime_clk = CLOCK_BOOTTIME;
static auto boottime_clk_freq = get_clock_freq_ns_impl(boottime_clk);
return get_ticks(boottime_clk) * boottime_clk_freq;
}
std::vector<std::string>
@@ -69,34 +127,85 @@ get_val(Container& map, const Key& key)
return (pos != map.end() ? &pos->second : nullptr);
}
template <typename Tp>
constexpr void
assert_public_api_struct_properties()
{
static_assert(std::is_class<Tp>::value, "this is not a public API struct");
static_assert(std::is_standard_layout<Tp>::value,
"public API struct should have a standard layout");
static_assert(std::is_trivially_default_constructible<Tp>::value,
"public API struct should be trivially default constructible");
static_assert(std::is_trivially_copy_constructible<Tp>::value,
"public API struct should be trivially copy constructible");
static_assert(std::is_trivially_move_constructible<Tp>::value,
"public API struct should be trivially move constructible");
static_assert(std::is_trivially_copy_assignable<Tp>::value,
"public API struct should be trivially move assignable");
static_assert(std::is_trivially_move_assignable<Tp>::value,
"public API struct should be trivially move assignable");
static_assert(std::is_trivially_copyable<Tp>::value,
"public API struct should be trivially move assignable");
static_assert(std::is_trivial<Tp>::value, "public API struct should be trivial");
static_assert(offsetof(Tp, size) == 0, "public API struct should have a size field first");
static_assert(sizeof(std::declval<Tp>().size) == sizeof(uint64_t),
"public API struct size field should be 64 bits");
}
template <typename Tp>
decltype(auto)
init_public_api_struct(Tp&& val)
{
assert_public_api_struct_properties<Tp>();
::memset(&val, 0, sizeof(Tp));
val.size = sizeof(Tp);
return std::forward<Tp>(val);
}
template <typename Tp>
Tp&
init_public_api_struct(Tp& val)
{
assert_public_api_struct_properties<Tp>();
::memset(&val, 0, sizeof(Tp));
val.size = sizeof(Tp);
return val;
}
/**
* A simple wrapper that will call a function when the
* wrapper is being destroyed. This is primarily useful
* for static variables where we want to run some destruction
* operations when the program exits.
*/
template <typename T, typename L>
template <typename Tp>
class static_cleanup_wrapper
{
public:
static_cleanup_wrapper(T&& data, L&& destroy_func)
: _data(std::move(data))
, _destroy_func(destroy_func)
using data_type = Tp;
using functor_type = std::function<void(Tp&)>;
static_cleanup_wrapper(data_type&& data, functor_type&& destroy_func)
: m_data(std::move(data))
, m_destroy_func(std::move(destroy_func))
{}
static_cleanup_wrapper(L&& destroy_func)
: _destroy_func(destroy_func)
static_cleanup_wrapper(functor_type&& destroy_func)
: m_destroy_func(std::move(destroy_func))
{}
~static_cleanup_wrapper() { _destroy_func(_data); }
~static_cleanup_wrapper() { m_destroy_func(m_data); }
void destroy() { _destroy_func(_data); }
void destroy() { m_destroy_func(m_data); }
T& get() { return _data; }
data_type& get() { return m_data; }
const data_type& get() const { return m_data; }
private:
T _data;
L _destroy_func;
data_type m_data = {};
functor_type m_destroy_func = {};
};
/**
@@ -107,41 +216,10 @@ private:
class active_capacity_gate
{
public:
active_capacity_gate(size_t capacity)
: _capacity(capacity)
{}
void add_active(size_t size)
{
if(size >= _capacity)
{
throw std::runtime_error("Size exceeds gate capacity");
}
active_capacity_gate(size_t capacity);
std::unique_lock lock(_m);
if(_count + size < _capacity)
{
_count += size;
return;
}
_waiters++;
_cv.wait(lock, [&]() { return _count + size < _capacity; });
_waiters--;
_count += size;
}
void remove_active(size_t size)
{
std::unique_lock lock(_m);
if(_count > size)
_count -= size;
else
_count = 0;
if(_waiters > 0)
{
_cv.notify_all();
}
}
void add_active(size_t size);
void remove_active(size_t size);
private:
size_t _count{0};
@@ -151,5 +229,44 @@ private:
std::condition_variable _cv;
};
inline active_capacity_gate::active_capacity_gate(size_t capacity)
: _capacity(capacity)
{}
inline void
active_capacity_gate::add_active(size_t size)
{
if(size >= _capacity)
{
throw std::runtime_error("Size exceeds gate capacity");
}
std::unique_lock lock(_m);
if(_count + size < _capacity)
{
_count += size;
return;
}
_waiters++;
_cv.wait(lock, [&]() { return _count + size < _capacity; });
_waiters--;
_count += size;
}
inline void
active_capacity_gate::remove_active(size_t size)
{
std::unique_lock lock(_m);
if(_count > size)
_count -= size;
else
_count = 0;
if(_waiters > 0)
{
_cv.notify_all();
}
}
} // namespace common
} // namespace rocprofiler
+1 -1
Просмотреть файл
@@ -786,7 +786,7 @@ get_rocprofiler_agent(hsa_agent_t agent)
{
for(const auto& itr : get_agent_caches())
{
if(itr == agent) return &itr.get_rocp_agent();
if(itr == agent) return itr.get_rocp_agent();
}
return nullptr;
+1 -1
Просмотреть файл
@@ -143,7 +143,7 @@ flush(rocprofiler_buffer_id_t buffer_id, bool wait)
}
else
{
LOG(ERROR) << "buffer at " << buffer_id.handle << " is empty...";
LOG(INFO) << "buffer at " << buffer_id.handle << " is empty...";
}
buff_v->syncer.clear();
+3 -3
Просмотреть файл
@@ -83,10 +83,10 @@ rocprofiler_context_is_active(rocprofiler_context_id_t context_id, int* status)
if(context_id.handle == rocprofiler_context_none.handle)
return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND;
for(const auto& itr : rocprofiler::context::get_active_contexts())
auto ctxs = std::vector<const rocprofiler::context::context*>{};
for(const auto* itr : rocprofiler::context::get_active_contexts(ctxs))
{
const auto* cfg = itr.load(std::memory_order_relaxed);
if(cfg && cfg->context_idx == context_id.handle)
if(itr && itr->context_idx == context_id.handle)
{
*status = 1;
return ROCPROFILER_STATUS_SUCCESS;
+146 -28
Просмотреть файл
@@ -20,10 +20,13 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <rocprofiler/buffer_tracing.h>
#include <rocprofiler/fwd.h>
#include <rocprofiler/rocprofiler.h>
#include "lib/common/container/stable_vector.hpp"
#include "lib/common/synchronized.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler/buffer.hpp"
#include "lib/rocprofiler/context/context.hpp"
#include "lib/rocprofiler/counters/core.hpp"
@@ -33,6 +36,7 @@
#include <unistd.h>
#include <atomic>
#include <cstddef>
#include <deque>
#include <memory>
#include <mutex>
#include <optional>
@@ -43,6 +47,8 @@ namespace context
{
namespace
{
using reserve_size_t = common::container::reserve_size;
auto&
get_contexts_mutex()
{
@@ -58,26 +64,16 @@ get_client_index()
static auto _v = invalid_client_idx;
return _v;
}
} // namespace
uint64_t
correlation_tracing_service::get_unique_internal_id()
auto&
get_num_active_contexts()
{
static auto _v = std::atomic<uint64_t>{};
return _v++;
}
using reserve_size_t = common::container::reserve_size;
unique_context_vec_t&
get_registered_contexts()
{
static auto _v = unique_context_vec_t{reserve_size_t{unique_context_vec_t::chunk_size}};
static auto _v = std::atomic<int64_t>{0};
return _v;
}
active_context_vec_t&
get_active_contexts()
get_active_contexts_impl()
{
static auto* _v = new active_context_vec_t{reserve_size_t{active_context_vec_t::chunk_size}};
static auto _once = std::once_flag{};
@@ -90,6 +86,97 @@ get_active_contexts()
return *_v;
}
auto&
get_correlation_id_map()
{
static auto _v = common::Synchronized<std::vector<std::unique_ptr<correlation_id>>>{};
return _v;
}
auto*&
get_latest_correlation_id_impl()
{
static thread_local correlation_id* _v = nullptr;
return _v;
}
uint64_t
get_unique_internal_id()
{
static auto _v = std::atomic<uint64_t>{};
return ++_v;
}
} // namespace
correlation_id*
correlation_tracing_service::construct(uint32_t _init_ref_count)
{
auto _internal_id = get_unique_internal_id();
auto& corr_id_map = get_correlation_id_map();
auto& ret = corr_id_map.wlock([](auto& data) -> auto& { return data.emplace_back(); });
ret = std::make_unique<correlation_id>(_init_ref_count, common::get_tid(), _internal_id);
get_latest_correlation_id_impl() = ret.get();
return ret.get();
}
correlation_id*
get_latest_correlation_id()
{
return get_latest_correlation_id_impl();
}
void
pop_latest_correlation_id(const correlation_id* val)
{
if(get_latest_correlation_id_impl() == val) get_latest_correlation_id_impl() = nullptr;
}
unique_context_vec_t&
get_registered_contexts()
{
static auto _v = unique_context_vec_t{reserve_size_t{unique_context_vec_t::chunk_size}};
return _v;
}
std::vector<const context*>&
get_active_contexts(std::vector<const context*>& data, context_filter_t filter)
{
data.clear();
auto num_ctx = get_num_active_contexts().load(std::memory_order_acquire);
if(num_ctx <= 0) return data;
data.reserve(num_ctx);
for(auto& itr : get_active_contexts_impl())
{
const auto* ctx = itr.load(std::memory_order_acquire);
if(ctx)
{
if(!filter || (filter && filter(ctx))) data.emplace_back(ctx);
}
if(static_cast<int64_t>(data.size()) == num_ctx)
{
// if the number of active contexts changed, restart
if(num_ctx != get_num_active_contexts().load(std::memory_order_relaxed))
{
data.clear();
return get_active_contexts(data, filter);
}
break;
}
}
return data;
}
std::vector<const context*>
get_active_contexts(context_filter_t filter)
{
auto data = std::vector<const context*>{};
get_active_contexts(data, filter);
return data;
}
// set the client index needs to be called before allocate_context()
void
push_client(uint32_t value)
@@ -173,15 +260,30 @@ start_context(rocprofiler_context_id_t context_id)
return ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID;
}
auto current_contexts = std::vector<const context*>{};
for(const auto* itr : get_active_contexts(current_contexts))
{
if(cfg->context_idx == itr->context_idx)
{
return ROCPROFILER_STATUS_SUCCESS;
}
else if(cfg->counter_collection && itr->counter_collection)
{
// conflicting context
return ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT;
}
}
uint64_t rocp_tot_contexts = get_registered_contexts().size();
auto idx = rocp_tot_contexts;
auto& active_contexts = get_active_contexts_impl();
{
// hold a lock here so prevent multiple threads from finding the same nullptr slot
// hold a lock here to prevent multiple threads from finding the same nullptr slot
auto _lk = std::unique_lock<std::mutex>{get_contexts_mutex()};
// try to find a nullptr slot first
for(size_t i = 0; i < get_active_contexts().size(); ++i)
for(size_t i = 0; i < active_contexts.size(); ++i)
{
const auto* itr = get_active_contexts().at(i).load(std::memory_order_relaxed);
const auto* itr = active_contexts.at(i).load(std::memory_order_relaxed);
if(itr == nullptr)
{
idx = i;
@@ -195,19 +297,25 @@ start_context(rocprofiler_context_id_t context_id)
// if no nullptr slot was found, then create one while lock is held
if(idx == rocp_tot_contexts)
{
idx = get_active_contexts().size();
get_active_contexts().emplace_back();
idx = active_contexts.size();
active_contexts.emplace_back();
}
get_num_active_contexts().fetch_add(1, std::memory_order_release);
}
// atomic swap the pointer into the "active" array used internally
const context* _expected = nullptr;
bool success = get_active_contexts().at(idx).compare_exchange_strong(
bool success = active_contexts.at(idx).compare_exchange_strong(
_expected, get_registered_contexts().at(context_id.handle).get());
if(!success) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
if(!success)
{
get_num_active_contexts().fetch_sub(1, std::memory_order_release);
return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
}
rocprofiler::counters::start_context(context_id);
if(cfg->counter_collection) rocprofiler::counters::start_context(cfg);
return ROCPROFILER_STATUS_SUCCESS;
}
@@ -215,17 +323,27 @@ start_context(rocprofiler_context_id_t context_id)
rocprofiler_status_t
stop_context(rocprofiler_context_id_t idx)
{
// hold a lock here to prevent other thread from changing the active contexts array
auto _lk = std::unique_lock<std::mutex>{get_contexts_mutex()};
// atomically assign the context pointer to NULL so that it is skipped in future
// callbacks
for(auto& itr : get_active_contexts())
for(auto& itr : get_active_contexts_impl())
{
const auto* _expected = itr.load(std::memory_order_relaxed);
const context* _expected = itr.load(std::memory_order_acquire);
if(_expected && _expected->context_idx == idx.handle)
{
bool success = itr.compare_exchange_strong(_expected, nullptr);
rocprofiler::counters::stop_context(idx);
if(success) return ROCPROFILER_STATUS_SUCCESS;
if(success)
{
auto nactive = get_num_active_contexts().load(std::memory_order_acquire);
if(nactive > 0) get_num_active_contexts().fetch_sub(1, std::memory_order_release);
if(_expected->counter_collection)
rocprofiler::counters::stop_context(const_cast<context*>(_expected));
return ROCPROFILER_STATUS_SUCCESS;
}
}
}
@@ -235,10 +353,10 @@ stop_context(rocprofiler_context_id_t idx)
void
deactivate_client_contexts(rocprofiler_client_id_t client_id)
{
for(auto& itr : get_active_contexts())
for(auto& itr : get_active_contexts_impl())
{
const auto* itr_v = itr.load();
if(itr_v->client_idx == client_id.handle)
if(itr_v && itr_v->client_idx == client_id.handle)
{
itr.store(nullptr);
}
+56 -3
Просмотреть файл
@@ -46,12 +46,53 @@ using external_cid_cb_t = uint64_t (*)(rocprofiler_service_callback_tracing_kind
uint32_t,
uint64_t);
constexpr auto null_user_data = rocprofiler_user_data_t{.value = 0};
struct correlation_id
{
// reference count starts at 5:
// - decrement after begin callback/buffer API
// - decrement after end callback/buffer API
// - decrement after kernel dispatch/HW counters
// - if PC sampling is not enabled, we can "retire" correlation id at ref count at 2
// - if PC sampling is enabled, we decrement after each HSA buffer flush once ref count hits 2
// - after the kernel dispatch completes, we know no more PC samples will be generated and
// thus, after two HSA buffer flushes, we will have received all the PC samples for
// the
correlation_id(uint32_t _cnt, rocprofiler_thread_id_t _tid, uint64_t _internal) noexcept
: ref_count{_cnt}
, thread_idx{_tid}
, internal{_internal}
{}
correlation_id() = default;
~correlation_id() = default;
correlation_id(correlation_id&& val) noexcept = delete;
correlation_id(const correlation_id&) = delete;
correlation_id& operator=(const correlation_id&) = delete;
correlation_id& operator=(correlation_id&&) noexcept = delete;
std::atomic<uint32_t> ref_count = {};
rocprofiler_thread_id_t thread_idx = 0;
uint64_t internal = 0;
};
correlation_id*
get_correlation_id(rocprofiler_thread_id_t tid, uint64_t internal_id);
// latest correlation id for thread
correlation_id*
get_latest_correlation_id();
void
pop_latest_correlation_id(const correlation_id*);
/// permits tools opportunity to modify the correlation id based on the domain, op, and
/// the rocprofiler generated correlation id
struct correlation_tracing_service
{
external_correlation::external_correlation external_correlator = {};
static uint64_t get_unique_internal_id();
static correlation_id* construct(uint32_t init_ref_count);
};
struct callback_tracing_service
@@ -139,8 +180,20 @@ using active_context_vec_t = common::container::stable_vector<std::atomic<const
unique_context_vec_t&
get_registered_contexts();
active_context_vec_t&
get_active_contexts();
using context_filter_t = bool (*)(const context*);
inline bool
default_context_filter(const context* val)
{
return (val != nullptr);
}
std::vector<const context*>&
get_active_contexts(std::vector<const context*>& data,
context_filter_t filter = default_context_filter);
std::vector<const context*>
get_active_contexts(context_filter_t filter = default_context_filter);
void deactivate_client_contexts(rocprofiler_client_id_t);
+19 -21
Просмотреть файл
@@ -22,7 +22,7 @@ std::unique_ptr<rocprofiler::hsa::AQLPacket>
queue_cb(const std::shared_ptr<rocprofiler::counters::counter_callback_info>& info,
const hsa::Queue& queue,
hsa::ClientID,
const hsa_ext_amd_aql_pm4_packet_t&)
hsa::rocprofiler_packet)
{
if(!info) return nullptr;
@@ -95,7 +95,7 @@ void
completed_cb(const std::shared_ptr<rocprofiler::counters::counter_callback_info>& info,
const hsa::Queue& queue,
hsa::ClientID,
const hsa_ext_amd_aql_pm4_packet_t& kernel,
hsa::rocprofiler_packet kernel,
std::unique_ptr<rocprofiler::hsa::AQLPacket> pkt)
{
if(!info) return;
@@ -127,7 +127,7 @@ completed_cb(const std::shared_ptr<rocprofiler::counters::counter_callback_info>
info->user_cb(queue.get_id(),
info->profile_cfg.agent,
rocprofiler_correlation_id_t{},
reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(&kernel),
&kernel.kernel_dispatch,
info->callback_args,
out.data(), // Date pointer does here.
out.size(), // Number of objects
@@ -215,31 +215,29 @@ destroy_counter_profile(uint64_t id)
}
void
start_context(rocprofiler_context_id_t context_id)
start_context(context::context* ctx)
{
auto& ctx = *rocprofiler::context::get_registered_contexts().at(context_id.handle);
if(!ctx || !ctx->counter_collection) return;
auto& controller = hsa::get_queue_controller();
if(!ctx.counter_collection) return;
// Only one thread should be attempting to enable/disable this context
ctx.counter_collection->enabled.wlock([&](auto& enabled) {
ctx->counter_collection->enabled.wlock([&](auto& enabled) {
if(enabled) return;
for(auto& cb : ctx.counter_collection->callbacks)
for(auto& cb : ctx->counter_collection->callbacks)
{
// Insert our callbacks into HSA Interceptor. This
// turns on counter instrumentation.
cb->queue_id = controller.add_callback(
cb->profile_cfg.agent,
[=](const hsa::Queue& q,
hsa::ClientID c,
const hsa_ext_amd_aql_pm4_packet_t& kern_pkt) {
[=](const hsa::Queue& q, hsa::ClientID c, hsa::rocprofiler_packet kern_pkt) {
return queue_cb(cb, q, c, kern_pkt);
},
// Completion CB
[=](const hsa::Queue& q,
hsa::ClientID c,
const hsa_ext_amd_aql_pm4_packet_t& kern_pkt,
std::unique_ptr<hsa::AQLPacket> aql) {
[=](const hsa::Queue& q,
hsa::ClientID c,
hsa::rocprofiler_packet kern_pkt,
std::unique_ptr<hsa::AQLPacket> aql) {
completed_cb(cb, q, c, kern_pkt, std::move(aql));
});
}
@@ -248,15 +246,15 @@ start_context(rocprofiler_context_id_t context_id)
}
void
stop_context(rocprofiler_context_id_t context_id)
stop_context(context::context* ctx)
{
auto& controller = hsa::get_queue_controller();
auto& ctx = *rocprofiler::context::get_registered_contexts().at(context_id.handle);
if(!ctx.counter_collection) return;
if(!ctx || !ctx->counter_collection) return;
ctx.counter_collection->enabled.wlock([&](auto& enabled) {
auto& controller = hsa::get_queue_controller();
ctx->counter_collection->enabled.wlock([&](auto& enabled) {
if(!enabled) return;
for(auto& cb : ctx.counter_collection->callbacks)
for(auto& cb : ctx->counter_collection->callbacks)
{
// Remove our callbacks from HSA's queue controller
controller.remove_callback(cb->queue_id);
+19 -9
Просмотреть файл
@@ -11,6 +11,10 @@
namespace rocprofiler
{
namespace context
{
struct context;
}
namespace counters
{
// Stores counter profiling information such as the agent
@@ -55,15 +59,21 @@ struct counter_callback_info
};
uint64_t
create_counter_profile(profile_config&& config);
void destroy_counter_profile(uint64_t);
bool
configure_dispatch(rocprofiler_context_id_t context_id,
uint64_t profile_id,
rocprofiler_profile_counting_dispatch_callback_t callback,
void* callback_args);
void start_context(rocprofiler_context_id_t);
create_counter_profile(profile_config&& config);
void stop_context(rocprofiler_context_id_t);
void
destroy_counter_profile(uint64_t id);
bool
configure_dispatch(rocprofiler_context_id_t context_id,
uint64_t profile_id,
rocprofiler_profile_counting_dispatch_callback_t callback,
void* callback_args);
void
start_context(context::context*);
void
stop_context(context::context*);
} // namespace counters
} // namespace rocprofiler
+5 -5
Просмотреть файл
@@ -84,16 +84,16 @@ getBlockDimensions(const std::string& agent, const Metric& metric)
{
return std::vector<MetricDimension>{
{dimension_map().at(ROCPROFILER_DIMENSION_SHADER_ENGINE),
maybe_agent.get_rocp_agent().num_shader_banks,
maybe_agent.get_rocp_agent()->num_shader_banks,
ROCPROFILER_DIMENSION_SHADER_ENGINE},
{dimension_map().at(ROCPROFILER_DIMENSION_XCC),
maybe_agent.get_rocp_agent().num_xcc,
maybe_agent.get_rocp_agent()->num_xcc,
ROCPROFILER_DIMENSION_XCC},
{dimension_map().at(ROCPROFILER_DIMENSION_CU),
maybe_agent.get_rocp_agent().cu_count,
maybe_agent.get_rocp_agent()->cu_count,
ROCPROFILER_DIMENSION_CU},
{dimension_map().at(ROCPROFILER_DIMENSION_AGENT),
maybe_agent.get_rocp_agent().id.handle,
maybe_agent.get_rocp_agent()->id.handle,
ROCPROFILER_DIMENSION_AGENT}};
// auto query_info = aql::get_query_info(maybe_agent.get_agent(), metric);
@@ -106,4 +106,4 @@ getBlockDimensions(const std::string& agent, const Metric& metric)
}
} // namespace counters
} // namespace rocprofiler
} // namespace rocprofiler
+1 -1
Просмотреть файл
@@ -72,7 +72,7 @@ public:
CONST_NONCONST_ACCESSOR(hsa_agent_t, get_hsa_agent, m_hsa_agent);
CONST_NONCONST_ACCESSOR(hsa_agent_t, near_cpu, m_nearest_cpu);
const rocprofiler_agent_t& get_rocp_agent() const { return *m_rocp_agent; }
const rocprofiler_agent_t* get_rocp_agent() const { return m_rocp_agent; }
std::string_view name() const { return m_name; }
size_t index() const { return m_index; }
+33 -16
Просмотреть файл
@@ -175,8 +175,6 @@ hsa_api_impl<Idx>::functor(Args&&... args)
{
using info_type = hsa_api_info<Idx>;
LOG(INFO) << __PRETTY_FUNCTION__;
struct callback_context_data
{
const context::context* ctx = nullptr;
@@ -190,14 +188,17 @@ hsa_api_impl<Idx>::functor(Args&&... args)
rocprofiler_user_data_t external_correlation = {};
};
auto thr_id = common::get_tid();
auto callback_contexts = std::vector<callback_context_data>{};
auto buffered_contexts = std::vector<buffered_context_data>{};
for(const auto& aitr : context::get_active_contexts())
static thread_local auto active_contexts = std::vector<const context::context*>{};
auto thr_id = common::get_tid();
auto callback_contexts = std::vector<callback_context_data>{};
auto buffered_contexts = std::vector<buffered_context_data>{};
auto has_pc_sampling = false;
for(const auto* itr : context::get_active_contexts(active_contexts))
{
const auto* itr = aitr.load();
if(!itr) continue;
// if(itr->pc_sampler) has_pc_sampling = true;
if(itr->callback_tracer)
{
// if the given domain + op is not enabled, skip this context
@@ -226,10 +227,16 @@ hsa_api_impl<Idx>::functor(Args&&... args)
return HSA_STATUS_SUCCESS;
}
constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0};
auto buffer_record = rocprofiler_buffer_tracing_hsa_api_record_t{};
auto tracer_data = rocprofiler_callback_tracing_hsa_api_data_t{};
auto internal_corr_id = context::correlation_tracing_service::get_unique_internal_id();
using correlation_service = context::correlation_tracing_service;
using buffer_hsa_api_record_t = rocprofiler_buffer_tracing_hsa_api_record_t;
using callback_hsa_api_data_t = rocprofiler_callback_tracing_hsa_api_data_t;
constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0};
auto ref_count = (has_pc_sampling) ? 4 : 2;
auto buffer_record = common::init_public_api_struct(buffer_hsa_api_record_t{});
auto tracer_data = common::init_public_api_struct(callback_hsa_api_data_t{});
auto* corr_id = correlation_service::construct(ref_count);
auto internal_corr_id = corr_id->internal;
// construct the buffered info before the callback so the callbacks are as closely wrapped
// around the function call as possible
@@ -246,7 +253,6 @@ hsa_api_impl<Idx>::functor(Args&&... args)
// invoke the callbacks
if(!callback_contexts.empty())
{
tracer_data.size = sizeof(rocprofiler_callback_tracing_hsa_api_data_t);
set_data_args(info_type::get_api_data_args(tracer_data.args), std::forward<Args>(args)...);
for(auto& itr : callback_contexts)
@@ -255,12 +261,13 @@ hsa_api_impl<Idx>::functor(Args&&... args)
auto& record = itr.record;
auto& user_data = itr.user_data;
auto corr_id = rocprofiler_correlation_id_t{
internal_corr_id, ctx->correlation_tracer.external_correlator.get(thr_id)};
auto extern_corr_id_v = ctx->correlation_tracer.external_correlator.get(thr_id);
auto corr_id_v = rocprofiler_correlation_id_t{internal_corr_id, extern_corr_id_v};
record =
rocprofiler_callback_tracing_record_t{rocprofiler_context_id_t{ctx->context_idx},
thr_id,
corr_id,
corr_id_v,
info_type::callback_domain_idx,
info_type::operation_idx,
ROCPROFILER_CALLBACK_PHASE_ENTER,
@@ -287,6 +294,9 @@ hsa_api_impl<Idx>::functor(Args&&... args)
buffer_record.start_timestamp = common::timestamp_ns();
}
// decrement the reference count before invoking
corr_id->ref_count.fetch_sub(1);
auto _ret = exec(info_type::get_table_func(), std::forward<Args>(args)...);
// record the end timestamp as close to the function call as possible
@@ -333,13 +343,18 @@ hsa_api_impl<Idx>::functor(Args&&... args)
bitr->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING,
info_type::buffered_domain_idx,
buffer_record);
record_v);
break;
}
}
}
}
// decrement the reference count after usage in the callback/buffers
corr_id->ref_count.fetch_sub(1);
context::pop_latest_correlation_id(corr_id);
if constexpr(!std::is_same<decltype(_ret), null_type>::value)
return _ret;
else
@@ -348,6 +363,8 @@ hsa_api_impl<Idx>::functor(Args&&... args)
} // namespace hsa
} // namespace rocprofiler
#define ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL 1
// template specializations
#include "hsa.def.cpp"
+11 -4
Просмотреть файл
@@ -18,10 +18,13 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include <rocprofiler/hsa.h>
#if defined(ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL) && \
ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL == 1
#include "defines.hpp"
#include "hsa.hpp"
# include <rocprofiler/hsa.h>
# include "defines.hpp"
# include "hsa.hpp"
// clang-format off
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_API_TABLE_ID_CoreApi, core_)
@@ -223,7 +226,7 @@ HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, ROCPROFILER_HSA_A
HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, ROCPROFILER_HSA_API_ID_hsa_amd_runtime_queue_create_register, hsa_amd_runtime_queue_create_register, hsa_amd_runtime_queue_create_register_fn, callback, user_data)
// clang-format on
#if HSA_AMD_EXT_API_TABLE_MAJOR_VERSION >= 0x02
# if HSA_AMD_EXT_API_TABLE_MAJOR_VERSION >= 0x02
HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt,
ROCPROFILER_HSA_API_ID_hsa_amd_vmem_address_reserve,
hsa_amd_vmem_address_reserve,
@@ -308,4 +311,8 @@ HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt,
alloc_handle,
pool,
type)
# endif
#else
# error "Do not compile this file directly. It is included by lib/rocprofiler/hsa/hsa.cpp"
#endif
+66 -59
Просмотреть файл
@@ -19,8 +19,15 @@
THE SOFTWARE. */
#include "lib/rocprofiler/hsa/queue.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler/buffer.hpp"
#include "lib/rocprofiler/context/context.hpp"
#include <glog/logging.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <rocprofiler/fwd.h>
#include <atomic>
#include <chrono>
#include <thread>
@@ -76,11 +83,11 @@ AsyncSignalHandler(hsa_signal_value_t, void* data)
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
queue_info_session.interrupt_signal);
}
if(queue_info_session.kernel_pkt.completion_signal.handle != 0u)
if(queue_info_session.kernel_pkt.ext_amd_aql_pm4.completion_signal.handle != 0u)
{
signals_to_remove++;
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
queue_info_session.kernel_pkt.completion_signal);
queue_info_session.kernel_pkt.ext_amd_aql_pm4.completion_signal);
}
if(signals_to_remove > 0)
{
@@ -92,25 +99,6 @@ AsyncSignalHandler(hsa_signal_value_t, void* data)
return false;
}
void
CreateBarrierPacket(const hsa_signal_t& packet_completion_signal,
std::vector<hsa_ext_amd_aql_pm4_packet_t>& transformed_packets)
{
hsa_barrier_and_packet_t barrier{};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
barrier.dep_signal[0] = packet_completion_signal;
void* barrier_ptr = &barrier;
transformed_packets.emplace_back(*reinterpret_cast<hsa_ext_amd_aql_pm4_packet_t*>(barrier_ptr));
}
void
AddVendorSpecificPacket(const hsa_ext_amd_aql_pm4_packet_t& packet,
std::vector<hsa_ext_amd_aql_pm4_packet_t>& transformed_packets,
const hsa_signal_t& packet_completion_signal)
{
transformed_packets.emplace_back(packet).completion_signal = packet_completion_signal;
}
template <typename Integral = uint64_t>
constexpr Integral
bit_mask(int first, int last)
@@ -145,48 +133,68 @@ WriteInterceptor(const void* packets,
void* data,
hsa_amd_queue_intercept_packet_writer writer)
{
Queue& queue_info = *static_cast<Queue*>(data);
auto&& AddVendorSpecificPacket = [](hsa_ext_amd_aql_pm4_packet_t _packet,
hsa_signal_t _signal,
std::vector<rocprofiler_packet>& _packets) {
_packets.emplace_back(_packet).ext_amd_aql_pm4.completion_signal = _signal;
};
auto&& CreateBarrierPacket = [](hsa_signal_t _signal,
std::vector<rocprofiler_packet>& _packets) {
hsa_barrier_and_packet_t barrier{};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
barrier.dep_signal[0] = _signal;
_packets.emplace_back(barrier);
};
LOG_IF(FATAL, data == nullptr) << "WriteInterceptor was not passed a pointer to the queue";
auto& queue = *static_cast<Queue*>(data);
auto thr_id = common::get_tid();
auto* corr_id = context::get_latest_correlation_id();
// increase the reference count to denote that this correlation id is being used in a kernel
if(corr_id) corr_id->ref_count.fetch_add(1);
// We have no packets or no one who needs to be notified, do nothing.
if(pkt_count == 0 || queue_info.get_notifiers() == 0)
if(pkt_count == 0 || queue.get_notifiers() == 0)
{
writer(packets, pkt_count);
return;
}
// hsa_ext_amd_aql_pm4_packet_t
const hsa_ext_amd_aql_pm4_packet_t* packets_arr =
static_cast<const hsa_ext_amd_aql_pm4_packet_t*>(packets);
std::vector<hsa_ext_amd_aql_pm4_packet_t> transformed_packets;
const auto* packets_arr = static_cast<const rocprofiler_packet*>(packets);
auto transformed_packets = std::vector<rocprofiler_packet>{};
// Searching accross all the packets given during this write
for(size_t i = 0; i < pkt_count; ++i)
{
const auto& original_packet = static_cast<const hsa_barrier_and_packet_t*>(packets)[i];
if(bit_extract(original_packet.header,
HSA_PACKET_HEADER_TYPE,
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1) !=
HSA_PACKET_TYPE_KERNEL_DISPATCH)
const auto& original_packet = packets_arr[i].kernel_dispatch;
auto packet_type = bit_extract(original_packet.header,
HSA_PACKET_HEADER_TYPE,
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1);
if(packet_type != HSA_PACKET_TYPE_KERNEL_DISPATCH)
{
transformed_packets.emplace_back(packets_arr[i]);
continue;
}
// Copy kernel pkt, copy is to allow for signal to be modified
hsa_ext_amd_aql_pm4_packet_t kernel_pkt = packets_arr[i];
queue_info.create_signal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &kernel_pkt.completion_signal);
rocprofiler_packet kernel_pkt = packets_arr[i];
queue.create_signal(HSA_AMD_SIGNAL_AMD_GPU_ONLY,
&kernel_pkt.ext_amd_aql_pm4.completion_signal);
// Stores the instrumentation pkt (i.e. AQL packets for counter collection)
// along with an ID of the client we got the packet from (this will be returned via
// CompletedCB)
// completed_cb_t)
ClientID inst_pkt_id = -1;
std::unique_ptr<AQLPacket> inst_pkt;
// Signal callbacks that a kernel_pkt is being enqueued
queue_info.signal_callback([&](const auto& map) {
queue.signal_callback([&](const auto& map) {
for(const auto& [client_id, cb_pair] : map)
{
if(auto maybe_pkt = cb_pair.first(queue_info, client_id, kernel_pkt))
if(auto maybe_pkt = cb_pair.first(queue, client_id, kernel_pkt))
{
LOG_IF(FATAL, inst_pkt)
<< "We do not support two injections into the HSA queue";
@@ -196,14 +204,13 @@ WriteInterceptor(const void* packets,
}
});
constexpr auto dummy_signal = hsa_signal_t{.handle = 0};
// Write instrumentation start packet (if one exists)
if(inst_pkt)
{
hsa_signal_t dummy_signal{};
dummy_signal.handle = 0;
inst_pkt->start.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(inst_pkt->start, transformed_packets, dummy_signal);
AddVendorSpecificPacket(inst_pkt->start, dummy_signal, transformed_packets);
CreateBarrierPacket(inst_pkt->start.completion_signal, transformed_packets);
}
@@ -214,25 +221,21 @@ WriteInterceptor(const void* packets,
if(original_packet.completion_signal.handle != 0u)
{
hsa_barrier_and_packet_t barrier{};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
hsa_ext_amd_aql_pm4_packet_t* __attribute__((__may_alias__)) pkt =
(reinterpret_cast<hsa_ext_amd_aql_pm4_packet_t*>(&barrier));
transformed_packets.emplace_back(*pkt).completion_signal =
original_packet.completion_signal;
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
barrier.completion_signal = original_packet.completion_signal;
transformed_packets.emplace_back(barrier);
}
hsa_signal_t interrupt_signal{};
// Adding a barrier packet with the original packet's completion signal.
queue_info.create_signal(0, &interrupt_signal);
queue.create_signal(0, &interrupt_signal);
if(inst_pkt)
{
hsa_signal_t dummy_signal{};
dummy_signal.handle = 0;
inst_pkt->stop.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(inst_pkt->stop, transformed_packets, dummy_signal);
AddVendorSpecificPacket(inst_pkt->stop, dummy_signal, transformed_packets);
inst_pkt->read.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
AddVendorSpecificPacket(inst_pkt->read, transformed_packets, interrupt_signal);
AddVendorSpecificPacket(inst_pkt->read, interrupt_signal, transformed_packets);
// Added Interrupt Signal with barrier and provided handler for it
CreateBarrierPacket(interrupt_signal, transformed_packets);
@@ -242,21 +245,25 @@ WriteInterceptor(const void* packets,
hsa_barrier_and_packet_t barrier{};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
barrier.completion_signal = interrupt_signal;
hsa_ext_amd_aql_pm4_packet_t* __attribute__((__may_alias__)) pkt =
(reinterpret_cast<hsa_ext_amd_aql_pm4_packet_t*>(&barrier));
transformed_packets.emplace_back(*pkt);
transformed_packets.emplace_back(barrier);
}
// TODO(jrmadsen): fetch kernel identifier from code object loading
uint64_t kernel_id = 0;
// Enqueue the signal into the handler. Will call completed_cb when
// signal completes.
queue_info.async_started();
queue_info.signal_async_handler(
queue.async_started();
queue.signal_async_handler(
interrupt_signal,
new Queue::queue_info_session_t{.queue = queue_info,
new Queue::queue_info_session_t{.queue = queue,
.inst_pkt = std::move(inst_pkt),
.inst_pkt_id = inst_pkt_id,
.kernel_pkt = kernel_pkt,
.interrupt_signal = interrupt_signal});
.interrupt_signal = interrupt_signal,
.tid = thr_id,
.kernel_id = kernel_id,
.correlation_id = corr_id,
.kernel_pkt = kernel_pkt});
}
writer(transformed_packets.data(), transformed_packets.size());
@@ -329,7 +336,7 @@ Queue::Queue(const AgentCache& agent,
}
void
Queue::register_callback(ClientID id, QueueCB enqueue_cb, CompletedCB complete_cb)
Queue::register_callback(ClientID id, queue_cb_t enqueue_cb, completed_cb_t complete_cb)
{
_callbacks.wlock([&](auto& map) {
LOG_IF(FATAL, rocprofiler::common::get_val(map, id)) << "ID already exists!";
+72 -22
Просмотреть файл
@@ -20,9 +20,11 @@
#pragma once
#include <rocprofiler/buffer_tracing.h>
#include <rocprofiler/fwd.h>
#include "lib/common/synchronized.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler/hsa/agent_cache.hpp"
#include "lib/rocprofiler/hsa/aql_packet.hpp"
@@ -30,6 +32,7 @@
#include <hsa/hsa.h>
#include <hsa/hsa_api_trace.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_aqlprofile.h>
#include <hsa/hsa_ven_amd_loader.h>
#include <atomic>
@@ -42,10 +45,49 @@
namespace rocprofiler
{
namespace context
{
struct correlation_id;
}
namespace hsa
{
using ClientID = int64_t;
union rocprofiler_packet
{
hsa_ext_amd_aql_pm4_packet_t ext_amd_aql_pm4;
hsa_kernel_dispatch_packet_t kernel_dispatch;
hsa_barrier_and_packet_t barrier_and;
hsa_barrier_or_packet_t barrier_or;
rocprofiler_packet()
: ext_amd_aql_pm4{null_amd_aql_pm4_packet}
{}
rocprofiler_packet(hsa_ext_amd_aql_pm4_packet_t val)
: ext_amd_aql_pm4{val}
{}
rocprofiler_packet(hsa_kernel_dispatch_packet_t val)
: kernel_dispatch{val}
{}
rocprofiler_packet(hsa_barrier_and_packet_t val)
: barrier_and{val}
{}
rocprofiler_packet(hsa_barrier_or_packet_t val)
: barrier_or{val}
{}
~rocprofiler_packet() = default;
rocprofiler_packet(const rocprofiler_packet&) = default;
rocprofiler_packet(rocprofiler_packet&&) noexcept = default;
rocprofiler_packet& operator=(const rocprofiler_packet&) = default;
rocprofiler_packet& operator=(rocprofiler_packet&&) noexcept = default;
};
// Interceptor for a single specific queue
class Queue
{
@@ -54,24 +96,25 @@ public:
// Function prototype used to notify consumers that a kernel has been
// enqueued. An AQL packet can be returned that will be injected into
// the queue.
using QueueCB = std::function<
std::unique_ptr<AQLPacket>(const Queue&, ClientID, const hsa_ext_amd_aql_pm4_packet_t&)>;
using queue_cb_t = std::function<
std::unique_ptr<AQLPacket>(const Queue&, ClientID, const rocprofiler_packet&)>;
// Signals the completion of the kernel packet.
using CompletedCB = std::function<void(const Queue&,
ClientID,
const hsa_ext_amd_aql_pm4_packet_t&,
std::unique_ptr<AQLPacket>)>;
using callback_map_t = std::unordered_map<ClientID, std::pair<QueueCB, CompletedCB>>;
using completed_cb_t = std::function<
void(const Queue&, ClientID, const rocprofiler_packet&, std::unique_ptr<AQLPacket>)>;
using callback_map_t = std::unordered_map<ClientID, std::pair<queue_cb_t, completed_cb_t>>;
// Internal session information that is used by write interceptor
// to track state of the intercepted kernel.
struct queue_info_session_t
{
Queue& queue;
std::unique_ptr<AQLPacket> inst_pkt = {};
ClientID inst_pkt_id = 0;
hsa_ext_amd_aql_pm4_packet_t kernel_pkt = null_amd_aql_pm4_packet;
hsa_signal_t interrupt_signal = {};
Queue& queue;
std::unique_ptr<AQLPacket> inst_pkt = {};
ClientID inst_pkt_id = 0;
hsa_signal_t interrupt_signal = {};
rocprofiler_thread_id_t tid = common::get_tid();
rocprofiler_kernel_id_t kernel_id = 0;
context::correlation_id* correlation_id = nullptr;
rocprofiler_packet kernel_pkt = {};
};
Queue(const AgentCache& agent,
@@ -93,16 +136,10 @@ public:
void create_signal(uint32_t attribute, hsa_signal_t* signal) const;
void signal_async_handler(const hsa_signal_t& signal, Queue::queue_info_session_t* data) const;
rocprofiler_queue_id_t get_id() const
{
return {.handle = reinterpret_cast<uint64_t>(intercept_queue())};
};
template <typename FuncT>
void signal_callback(FuncT&& func) const;
template <class Func>
void signal_callback(Func&& func) const
{
_callbacks.rlock([&func](const auto& data) { func(data); });
}
rocprofiler_queue_id_t get_id() const;
// Fast check to see if we have any callbacks we need to notify
int get_notifiers() const { return _notifiers; }
@@ -113,7 +150,7 @@ public:
void async_started() { _active_async_packets++; }
void async_complete() { _active_async_packets--; }
void register_callback(ClientID id, QueueCB enqueue_cb, CompletedCB complete_cb);
void register_callback(ClientID id, queue_cb_t enqueue_cb, completed_cb_t complete_cb);
void remove_callback(ClientID id);
const CoreApiTable& core_api() const { return _core_api; }
@@ -129,5 +166,18 @@ private:
hsa_queue_t* _intercept_queue = nullptr;
};
inline rocprofiler_queue_id_t
Queue::get_id() const
{
return {.handle = reinterpret_cast<uint64_t>(intercept_queue())};
};
template <typename FuncT>
inline void
Queue::signal_callback(FuncT&& func) const
{
_callbacks.rlock([&func](const auto& data) { func(data); });
}
} // namespace hsa
} // namespace rocprofiler
+19 -5
Просмотреть файл
@@ -80,7 +80,7 @@ QueueController::add_queue(hsa_queue_t* id, std::unique_ptr<Queue> queue)
CHECK(queue);
_callback_cache.wlock([&](auto& callbacks) {
_queues.wlock([&](auto& map) {
const auto agent_id = queue->get_agent().get_rocp_agent().id.handle;
const auto agent_id = queue->get_agent().get_rocp_agent()->id.handle;
map[id] = std::move(queue);
for(const auto& [cbid, cb_tuple] : callbacks)
{
@@ -102,8 +102,8 @@ QueueController::destory_queue(hsa_queue_t* id)
ClientID
QueueController::add_callback(const rocprofiler_agent_t& agent,
Queue::QueueCB qcb,
Queue::CompletedCB ccb)
Queue::queue_cb_t qcb,
Queue::completed_cb_t ccb)
{
static std::atomic<ClientID> client_id = 1;
ClientID return_id;
@@ -114,7 +114,7 @@ QueueController::add_callback(const rocprofiler_agent_t& agent,
_queues.wlock([&](auto& map) {
for(auto& [_, queue] : map)
{
if(queue->get_agent().get_rocp_agent().id.handle == agent.id.handle)
if(queue->get_agent().get_rocp_agent()->id.handle == agent.id.handle)
{
queue->register_callback(return_id, qcb, ccb);
}
@@ -150,7 +150,7 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
for(const auto* itr : agents)
{
auto cached_agent = agent::get_agent_cache(itr);
if(cached_agent && cached_agent->get_rocp_agent().type == ROCPROFILER_AGENT_TYPE_GPU)
if(cached_agent && cached_agent->get_rocp_agent()->type == ROCPROFILER_AGENT_TYPE_GPU)
{
get_supported_agents().emplace(cached_agent->index(), *cached_agent);
}
@@ -188,6 +188,20 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
}
}
const Queue*
QueueController::get_queue(const hsa_queue_t& _hsa_queue) const
{
return _queues.rlock(
[](const queue_map_t& _data, const hsa_queue_t& _inp) -> const Queue* {
for(const auto& itr : _data)
{
if(itr.first->id == _inp.id) return itr.second.get();
}
return nullptr;
},
_hsa_queue);
}
QueueController&
get_queue_controller()
{
+4 -2
Просмотреть файл
@@ -47,7 +47,7 @@ public:
// Add callback to queues associated with the agent. Returns a client
// id that can be used by callers to remove the callback.
ClientID add_callback(const rocprofiler_agent_t&, Queue::QueueCB, Queue::CompletedCB);
ClientID add_callback(const rocprofiler_agent_t&, Queue::queue_cb_t, Queue::completed_cb_t);
void remove_callback(ClientID);
const CoreApiTable& get_core_table() const { return _core_table; }
@@ -57,9 +57,11 @@ public:
const auto& get_supported_agents() const { return _supported_agents; }
auto& get_supported_agents() { return _supported_agents; }
const Queue* get_queue(const hsa_queue_t&) const;
private:
using agent_callback_tuple_t =
std::tuple<rocprofiler_agent_t, Queue::QueueCB, Queue::CompletedCB>;
std::tuple<rocprofiler_agent_t, Queue::queue_cb_t, Queue::completed_cb_t>;
using queue_map_t = std::unordered_map<hsa_queue_t*, std::unique_ptr<Queue>>;
using client_id_map_t = std::unordered_map<ClientID, agent_callback_tuple_t>;
using agent_cache_map_t = std::unordered_map<uint32_t, AgentCache>;
+2 -6
Просмотреть файл
@@ -41,18 +41,14 @@ namespace internal_threading
{
using thread_pool_t = PTL::ThreadPool;
using task_group_t = PTL::TaskGroup<void>;
using thread_pool_cleanup_t = rocprofiler::common::static_cleanup_wrapper<
std::unique_ptr<thread_pool_t>,
std::function<void(std::unique_ptr<thread_pool_t>&)>>;
using thread_pool_cleanup_t = common::static_cleanup_wrapper<std::unique_ptr<thread_pool_t>>;
using task_group_cleanup_t =
std::pair<std::unique_ptr<task_group_t>, std::shared_ptr<thread_pool_cleanup_t>>;
using thread_pool_vec_t = std::vector<std::shared_ptr<thread_pool_cleanup_t>>;
// Note: task_group maintains a shared_ptr copy to thread_pool to ensure it is not destroyed
// before the task can be sync'd.
using task_group_vec_t = rocprofiler::common::static_cleanup_wrapper<
std::vector<task_group_cleanup_t>,
std::function<void(std::vector<task_group_cleanup_t>&)>>;
using task_group_vec_t = common::static_cleanup_wrapper<std::vector<task_group_cleanup_t>>;
void notify_pre_internal_thread_create(rocprofiler_runtime_library_t);
void notify_post_internal_thread_create(rocprofiler_runtime_library_t);
+2 -2
Просмотреть файл
@@ -351,6 +351,7 @@ invoke_client_finalizers()
// set to nullptr so finalize only gets called once
itr.configure_result->finalize = nullptr;
}
context::deactivate_client_contexts(itr.internal_client_id);
}
return true;
@@ -377,6 +378,7 @@ invoke_client_finalizer(rocprofiler_client_id_t client_id)
// set to nullptr so finalize only gets called once
itr.configure_result->finalize = nullptr;
}
context::deactivate_client_contexts(itr.internal_client_id);
}
}
}
@@ -472,8 +474,6 @@ finalize()
if(get_init_status() > 0)
{
invoke_client_finalizers();
for(auto& itr : rocprofiler::context::get_active_contexts())
itr.store(nullptr, std::memory_order_seq_cst);
}
internal_threading::finalize();
set_fini_status(1);
+3 -3
Просмотреть файл
@@ -52,9 +52,9 @@ ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND, "Agent ID no
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, "HW counter not found")
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID,
"Context configuration is not valid")
ROCPROFILER_STATUS_STRING(
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED,
"Context failed to be started (may be already started or atomic swap may have failed)")
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, "Context failed to start")
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT,
"Context has a conflict with another context")
ROCPROFILER_STATUS_STRING(
ROCPROFILER_STATUS_ERROR_BUFFER_BUSY,
"Buffer operation failed because it is currently busy handling another request")
+5 -1
Просмотреть файл
@@ -32,6 +32,7 @@
#include <dlfcn.h>
#include <pthread.h>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <filesystem>
@@ -483,8 +484,11 @@ TEST(rocprofiler_lib, callback_registration_lambda_with_result)
EXPECT_GT(elapsed, 0);
#else
decltype(elapsed) elapsed_tolerance = 0.25 * elapsed;
int64_t diff = (cb_data.client_elapsed - elapsed);
auto frac = std::abs(diff) / (1.0 * elapsed);
EXPECT_NEAR(elapsed, cb_data.client_elapsed, elapsed_tolerance)
<< "it is possible this failed due to noise on the machine";
<< "% diff = " << std::fixed << std::setprecision(3) << (100.0 * frac)
<< "%. It is possible this failed due to noise on the machine";
#endif
ASSERT_NE(cb_data.client_id, nullptr);