Generalized updates (#174)
- include/rocprofiler/agent.h
- move rocprofiler_dim3_t
- include/rocprofiler/buffer_tracing.h
- size fields
- update kernel dispatch record
- include/rocprofiler/callback_tracing.h
- remove rocprofiler_callback_tracing_code_object_unload_data_t
- remove rocprofiler_callback_tracing_code_object_register_host_kernel_symbol_data_t
- include/rocprofiler/fwd.h
- added ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT
- remove ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_UNLOAD
- remove ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_UNREGISTER
- add rocprofiler_kernel_id_t typedef
- add rocprofiler_dim3_t (moved from agent.h)
- lib/common/synchronized.hpp
- rlock/wlock return decltype(auto)
- separate prototype from definition
- lib/common/utility.{hpp,cpp}
- timestamp functions replicating HSA implementation(s)
- init_public_api_struct for setting size field and ensuring certain type traits
- simplified static_cleanup_wrapper
- separate prototype from definition in active_capacity_gate
- lib/rocprofiler/agent.cpp
- tweak get_rocprofiler_agent impl
- lib/rocprofiler/buffer.cpp
- fix buffer message log level
- lib/rocprofiler/context.cpp
- use new paradigm for getting active contexts
- lib/rocprofiler/internal_threading.hpp
- update to simplified static_cleanup_wrapper implementation
- lib/rocprofiler/registration.cpp
- fix deactivating contexts
- lib/rocprofiler/rocprofiler.cpp
- status string for context conflict
- lib/rocprofiler/context/context.*
- correlation_id struct
- new get_active_contexts paradigm
- lib/rocprofiler/counters/core.*
- rocprofiler_packet union
- tweak start/stop context to accept pointer instead of handle
- lib/rocprofiler/counters/dimensions.cpp
- update to new get_rocp_agent() return type
- lib/rocprofiler/hsa/hsa.*
- update to new get_active_contexts paradigm
- update to new correlation id implementation
- guard against hsa.def.cpp direct compilation
- lib/rocprofiler/hsa/queue_controller.*
- update to change in get_rocp_agent return type
- consistent aliases
- lookup function for getting queue pointer from hsa queue id
- lib/rocprofiler/hsa/queue.*
- rocprofiler_packet
- extend queue_info_session_t
- lib/rocprofiler/tests/registration.cpp
- improve diagnostic on perf check for rocprofiler_lib.callback_registration_lambda_with_result
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
63775f241a
Коммит
55f2dabbb3
@@ -86,16 +86,6 @@ typedef struct rocprofiler_agent_mem_bank_t
|
||||
uint64_t size_in_bytes; ///< physical memory size of the memory range in bytes
|
||||
} rocprofiler_agent_mem_bank_t;
|
||||
|
||||
/**
|
||||
* @brief Multi-dimensional struct of data
|
||||
*/
|
||||
typedef struct rocprofiler_dim3_t
|
||||
{
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
uint32_t z;
|
||||
} rocprofiler_dim3_t;
|
||||
|
||||
/**
|
||||
* @brief Agent.
|
||||
*/
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler/agent.h>
|
||||
#include <rocprofiler/defines.h>
|
||||
#include <rocprofiler/fwd.h>
|
||||
|
||||
@@ -39,6 +40,7 @@ ROCPROFILER_EXTERN_C_INIT
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_tracing_operation_t operation; // rocprofiler/hsa.h
|
||||
@@ -52,6 +54,7 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_tracing_operation_t operation; // rocprofiler/hip.h
|
||||
@@ -65,6 +68,7 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_tracing_operation_t operation; // rocprofiler/marker.h
|
||||
@@ -79,6 +83,7 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
/**
|
||||
@@ -96,12 +101,18 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_timestamp_t start_timestamp;
|
||||
rocprofiler_timestamp_t end_timestamp;
|
||||
rocprofiler_agent_id_t agent_id;
|
||||
rocprofiler_queue_id_t queue_id;
|
||||
const char* kernel_name;
|
||||
rocprofiler_kernel_id_t kernel_id;
|
||||
uint32_t private_segment_size;
|
||||
uint32_t group_segment_size;
|
||||
rocprofiler_dim3_t workgroup_size;
|
||||
rocprofiler_dim3_t grid_size;
|
||||
} rocprofiler_buffer_tracing_kernel_dispatch_record_t;
|
||||
|
||||
/**
|
||||
@@ -109,6 +120,7 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_timestamp_t start_timestamp;
|
||||
@@ -122,6 +134,7 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_timestamp_t start_timestamp;
|
||||
@@ -135,6 +148,7 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
rocprofiler_timestamp_t start_timestamp;
|
||||
@@ -198,6 +212,7 @@ typedef struct
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size;
|
||||
rocprofiler_service_buffer_tracing_kind_t kind;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
} rocprofiler_buffer_tracing_correlation_record_t;
|
||||
|
||||
@@ -49,6 +49,7 @@ typedef enum
|
||||
ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE = HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE,
|
||||
ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY =
|
||||
HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY,
|
||||
ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_LAST,
|
||||
} rocprofiler_code_object_storage_type_t;
|
||||
|
||||
/**
|
||||
@@ -124,16 +125,6 @@ typedef struct
|
||||
};
|
||||
} rocprofiler_callback_tracing_code_object_load_data_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Code Object UnLoad Tracer Callback Record.
|
||||
*
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
uint64_t code_object_id; ///< unique code object identifier
|
||||
} rocprofiler_callback_tracing_code_object_unload_data_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Code Object Kernel Symbol Tracer Callback Record.
|
||||
*
|
||||
@@ -154,21 +145,7 @@ typedef struct
|
||||
///< (per work-group), in bytes
|
||||
uint32_t private_segment_size; ///< Size of static private, spill, and arg segment memory
|
||||
///< required by this kernel (per work-item), in bytes.
|
||||
} rocprofiler_callback_tracing_code_object_kernel_symbol_data_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Code Object Register Host Kernel Symbol Tracer Callback
|
||||
* Record.
|
||||
*
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_address_t host_address; // host address
|
||||
// Should this be nullptr if it is unregister?
|
||||
const char* kernel_name; // kernel name string (NULL terminated)
|
||||
rocprofiler_address_t kernel_descriptor; // kernel descriptor
|
||||
} rocprofiler_callback_tracing_code_object_register_host_kernel_symbol_data_t;
|
||||
} rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
|
||||
/**
|
||||
* @brief API Tracing callback function. This function is invoked twice per API function: once
|
||||
|
||||
@@ -58,9 +58,10 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, ///< Counter identifier does not exist
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR, ///> Generalized context error
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID, ///< Context configuration is not valid
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, ///< Context was not started (maybe already
|
||||
///< started or atomic swap into active array
|
||||
///< failed)
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, ///< Context was not started (e.g., atomic swap
|
||||
///< into active array failed)
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT, ///< Context operation failed due to a conflict with
|
||||
///< another context
|
||||
ROCPROFILER_STATUS_ERROR_BUFFER_BUSY, ///< buffer operation failed because it currently busy
|
||||
///< handling another request (e.g. flushing)
|
||||
ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED, ///< service has already been configured
|
||||
@@ -154,12 +155,9 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
{
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_NONE = 0,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_UNLOAD,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_UNREGISTER,
|
||||
// next two are part of hipRegisterFunction API.
|
||||
// ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_HOST_KERNEL_SYMBOL_REGISTER,
|
||||
// ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_HOST_KERNEL_SYMBOL_UNREGISTER,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LAST,
|
||||
} rocprofiler_callback_tracing_code_object_operation_t;
|
||||
|
||||
@@ -251,6 +249,12 @@ typedef uint64_t rocprofiler_thread_id_t;
|
||||
*/
|
||||
typedef uint32_t rocprofiler_tracing_operation_t;
|
||||
|
||||
/**
|
||||
* @brief Kernel identifier type
|
||||
*
|
||||
*/
|
||||
typedef uint64_t rocprofiler_kernel_id_t;
|
||||
|
||||
// forward declaration of struct
|
||||
typedef struct rocprofiler_pc_sampling_configuration_s rocprofiler_pc_sampling_configuration_t;
|
||||
|
||||
@@ -348,6 +352,16 @@ typedef struct
|
||||
uint64_t handle;
|
||||
} rocprofiler_profile_config_id_t;
|
||||
|
||||
/**
|
||||
* @brief Multi-dimensional struct of data used to describe GPU workgroup and grid sizes
|
||||
*/
|
||||
typedef struct rocprofiler_dim3_t
|
||||
{
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
uint32_t z;
|
||||
} rocprofiler_dim3_t;
|
||||
|
||||
/**
|
||||
* @brief Tracing record
|
||||
*
|
||||
|
||||
@@ -75,24 +75,10 @@ public:
|
||||
Synchronized& operator=(const Synchronized&) = delete;
|
||||
|
||||
template <typename FuncT, typename... Args>
|
||||
auto rlock(FuncT&& lambda, Args&&... args) const
|
||||
{
|
||||
static_assert(std::is_invocable<FuncT, const value_type&, Args...>::value,
|
||||
"function must accept const reference to locked type");
|
||||
|
||||
auto lock = std::shared_lock{m_mutex};
|
||||
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
decltype(auto) rlock(FuncT&& lambda, Args&&... args) const;
|
||||
|
||||
template <typename FuncT, typename... Args>
|
||||
auto wlock(FuncT&& lambda, Args&&... args)
|
||||
{
|
||||
static_assert(std::is_invocable<FuncT, value_type&, Args...>::value,
|
||||
"function must accept reference to locked type");
|
||||
|
||||
auto lock = std::unique_lock{m_mutex};
|
||||
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
decltype(auto) wlock(FuncT&& lambda, Args&&... args);
|
||||
|
||||
// This overload to wlock allows a synchronized map whose keys map to synchronized data to
|
||||
// use a read lock on the key data and then a write lock on the mapped data.
|
||||
@@ -100,42 +86,86 @@ public:
|
||||
typename... Args,
|
||||
bool EnableForMappedType = IsMappedTypeV,
|
||||
std::enable_if_t<EnableForMappedType, int> = 0>
|
||||
auto wlock(FuncT&& lambda, Args&&... args) const
|
||||
{
|
||||
return const_cast<this_type*>(this)->wlock(std::forward<FuncT>(lambda),
|
||||
std::forward<Args>(args)...);
|
||||
}
|
||||
decltype(auto) wlock(FuncT&& lambda, Args&&... args) const;
|
||||
|
||||
// Upgradable lock. If read returns false, write will be called with a unique_lock.
|
||||
// Essentially a helper function that does .rlock() followed by .wlock().
|
||||
template <typename ReadFuncT, typename WriteFuncT, typename... Args>
|
||||
bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args)
|
||||
{
|
||||
static_assert(std::is_invocable<ReadFuncT, const value_type&, Args...>::value,
|
||||
"read function must accept const reference to locked type");
|
||||
static_assert(std::is_invocable<WriteFuncT, value_type&, Args...>::value,
|
||||
"write function must accept reference to locked type");
|
||||
|
||||
using read_return_type = std::invoke_result_t<ReadFuncT, const value_type&, Args...>;
|
||||
using write_return_type = std::invoke_result_t<WriteFuncT, value_type&, Args...>;
|
||||
|
||||
static_assert(std::is_same<read_return_type, write_return_type>::value,
|
||||
"read and write functions must return same type");
|
||||
static_assert(std::is_same<read_return_type, bool>::value,
|
||||
"read/write functions must return bool");
|
||||
|
||||
{
|
||||
auto lock = std::shared_lock{m_mutex};
|
||||
if(read(m_data, std::forward<Args>(args)...)) return true;
|
||||
}
|
||||
|
||||
auto lock = std::unique_lock{m_mutex};
|
||||
return write(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
bool ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args);
|
||||
|
||||
private:
|
||||
mutable std::shared_mutex m_mutex = {};
|
||||
value_type m_data = {};
|
||||
};
|
||||
|
||||
//
|
||||
// member definitions
|
||||
//
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename FuncT, typename... Args>
|
||||
decltype(auto)
|
||||
Synchronized<LockedType, IsMappedTypeV>::rlock(FuncT&& lambda, Args&&... args) const
|
||||
{
|
||||
static_assert(std::is_invocable<FuncT, const value_type&, Args...>::value,
|
||||
"function must accept const reference to locked type");
|
||||
|
||||
auto lock = std::shared_lock{m_mutex};
|
||||
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename FuncT, typename... Args>
|
||||
decltype(auto)
|
||||
Synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args)
|
||||
{
|
||||
static_assert(std::is_invocable<FuncT, value_type&, Args...>::value,
|
||||
"function must accept reference to locked type");
|
||||
|
||||
auto lock = std::unique_lock{m_mutex};
|
||||
return std::forward<FuncT>(lambda)(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
// This overload to wlock allows a synchronized map whose keys map to synchronized data to
|
||||
// use a read lock on the key data and then a write lock on the mapped data.
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename FuncT,
|
||||
typename... Args,
|
||||
bool EnableForMappedType,
|
||||
std::enable_if_t<EnableForMappedType, int>>
|
||||
decltype(auto)
|
||||
Synchronized<LockedType, IsMappedTypeV>::wlock(FuncT&& lambda, Args&&... args) const
|
||||
{
|
||||
return const_cast<this_type*>(this)->wlock(std::forward<FuncT>(lambda),
|
||||
std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
// Upgradable lock. If read returns false, write will be called with a unique_lock.
|
||||
// Essentially a helper function that does .rlock() followed by .wlock().
|
||||
template <typename LockedType, bool IsMappedTypeV>
|
||||
template <typename ReadFuncT, typename WriteFuncT, typename... Args>
|
||||
bool
|
||||
Synchronized<LockedType, IsMappedTypeV>::ulock(ReadFuncT&& read, WriteFuncT&& write, Args&&... args)
|
||||
{
|
||||
static_assert(std::is_invocable<ReadFuncT, const value_type&, Args...>::value,
|
||||
"read function must accept const reference to locked type");
|
||||
static_assert(std::is_invocable<WriteFuncT, value_type&, Args...>::value,
|
||||
"write function must accept reference to locked type");
|
||||
|
||||
using read_return_type = std::invoke_result_t<ReadFuncT, const value_type&, Args...>;
|
||||
using write_return_type = std::invoke_result_t<WriteFuncT, value_type&, Args...>;
|
||||
|
||||
static_assert(std::is_same<read_return_type, write_return_type>::value,
|
||||
"read and write functions must return same type");
|
||||
static_assert(std::is_same<read_return_type, bool>::value,
|
||||
"read/write functions must return bool");
|
||||
|
||||
{
|
||||
auto lock = std::shared_lock{m_mutex};
|
||||
if(read(m_data, std::forward<Args>(args)...)) return true;
|
||||
}
|
||||
|
||||
auto lock = std::unique_lock{m_mutex};
|
||||
return write(m_data, std::forward<Args>(args)...);
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -21,16 +21,100 @@
|
||||
|
||||
#include "lib/common/utility.hpp"
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "lib/common/defines.hpp"
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
namespace
|
||||
{
|
||||
std::string_view
|
||||
get_clock_name(clockid_t _id)
|
||||
{
|
||||
#define CLOCK_NAME_CASE_STATEMENT(NAME) \
|
||||
case NAME: return #NAME;
|
||||
switch(_id)
|
||||
{
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_PROCESS_CPUTIME_ID)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_THREAD_CPUTIME_ID)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC_RAW)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME_COARSE)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_MONOTONIC_COARSE)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_BOOTTIME)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_REALTIME_ALARM)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_BOOTTIME_ALARM)
|
||||
CLOCK_NAME_CASE_STATEMENT(CLOCK_TAI)
|
||||
default: break;
|
||||
}
|
||||
return "CLOCK_UNKNOWN";
|
||||
}
|
||||
} // namespace
|
||||
|
||||
clockid_t
|
||||
get_accurate_clock_id_impl()
|
||||
{
|
||||
auto clock = CLOCK_MONOTONIC;
|
||||
utsname kernelInfo;
|
||||
if(uname(&kernelInfo) == 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
std::string ver = kernelInfo.release;
|
||||
size_t idx;
|
||||
int major = std::stoi(ver, &idx);
|
||||
int minor = std::stoi(ver.substr(idx + 1));
|
||||
if(major > 4 || ((major == 4) && (minor >= 4)))
|
||||
{
|
||||
clock = CLOCK_MONOTONIC_RAW;
|
||||
}
|
||||
} catch(...)
|
||||
{
|
||||
// Kernel version string doesn't conform to the standard pattern.
|
||||
// Keep using the "safe" (non-RAW) clock.
|
||||
}
|
||||
}
|
||||
return clock;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
get_clock_freq_ns_impl(clockid_t _clk_id)
|
||||
{
|
||||
constexpr auto nanosec = std::nano::den;
|
||||
|
||||
struct timespec ts;
|
||||
auto ret = clock_getres(_clk_id, &ts);
|
||||
|
||||
if(ROCPROFILER_UNLIKELY(ret != 0))
|
||||
{
|
||||
auto _err = errno;
|
||||
LOG(FATAL) << "error getting clock resolution for " << get_clock_name(_clk_id) << ": "
|
||||
<< strerror(_err);
|
||||
}
|
||||
else if(ROCPROFILER_UNLIKELY(ts.tv_sec != 0 ||
|
||||
ts.tv_nsec >= std::numeric_limits<uint32_t>::max()))
|
||||
{
|
||||
LOG(FATAL) << "clock_getres(" << get_clock_name(_clk_id)
|
||||
<< ") returned very low frequency (<1Hz)";
|
||||
}
|
||||
|
||||
auto&& _period =
|
||||
(static_cast<uint64_t>(ts.tv_sec) * nanosec) + static_cast<uint64_t>(ts.tv_nsec);
|
||||
return nanosec / _period;
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
read_command_line(pid_t _pid)
|
||||
{
|
||||
|
||||
@@ -22,19 +22,37 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lib/common/defines.hpp"
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/utsname.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <ratio>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
clockid_t
|
||||
get_accurate_clock_id_impl();
|
||||
|
||||
uint64_t
|
||||
get_clock_freq_ns_impl(clockid_t _clk_id);
|
||||
|
||||
inline uint64_t
|
||||
get_tid()
|
||||
{
|
||||
@@ -43,11 +61,51 @@ get_tid()
|
||||
return _v;
|
||||
}
|
||||
|
||||
inline clockid_t
|
||||
get_accurate_clock_id()
|
||||
{
|
||||
static auto clk_id = get_accurate_clock_id_impl();
|
||||
return clk_id;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
get_accurate_clock_freq_ns()
|
||||
{
|
||||
static auto clk_freq = get_clock_freq_ns_impl(get_accurate_clock_id());
|
||||
return clk_freq;
|
||||
}
|
||||
|
||||
inline uint64_t
|
||||
get_ticks(clockid_t clk_id_v) noexcept
|
||||
{
|
||||
constexpr auto nanosec = std::nano::den;
|
||||
auto&& ts = timespec{};
|
||||
auto ret = clock_gettime(clk_id_v, &ts);
|
||||
|
||||
if(ROCPROFILER_UNLIKELY(ret != 0))
|
||||
{
|
||||
auto _err = errno;
|
||||
LOG(FATAL) << "clock_gettime failed: " << strerror(_err);
|
||||
}
|
||||
|
||||
return (static_cast<uint64_t>(ts.tv_sec) * nanosec) + static_cast<uint64_t>(ts.tv_nsec);
|
||||
}
|
||||
|
||||
// this equates to HSA-runtime library implementation of os::ReadAccurateClock()
|
||||
inline uint64_t
|
||||
timestamp_ns()
|
||||
{
|
||||
// TODO(jrmadsen): this should be updated to the HSA method
|
||||
return std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
return get_ticks(get_accurate_clock_id()) * get_accurate_clock_freq_ns();
|
||||
}
|
||||
|
||||
// this equates to HSA-runtime library implementation of os::ReadSystemClock()
|
||||
inline uint64_t
|
||||
system_timestamp_ns()
|
||||
{
|
||||
constexpr auto boottime_clk = CLOCK_BOOTTIME;
|
||||
static auto boottime_clk_freq = get_clock_freq_ns_impl(boottime_clk);
|
||||
|
||||
return get_ticks(boottime_clk) * boottime_clk_freq;
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
@@ -69,34 +127,85 @@ get_val(Container& map, const Key& key)
|
||||
return (pos != map.end() ? &pos->second : nullptr);
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
constexpr void
|
||||
assert_public_api_struct_properties()
|
||||
{
|
||||
static_assert(std::is_class<Tp>::value, "this is not a public API struct");
|
||||
static_assert(std::is_standard_layout<Tp>::value,
|
||||
"public API struct should have a standard layout");
|
||||
static_assert(std::is_trivially_default_constructible<Tp>::value,
|
||||
"public API struct should be trivially default constructible");
|
||||
static_assert(std::is_trivially_copy_constructible<Tp>::value,
|
||||
"public API struct should be trivially copy constructible");
|
||||
static_assert(std::is_trivially_move_constructible<Tp>::value,
|
||||
"public API struct should be trivially move constructible");
|
||||
static_assert(std::is_trivially_copy_assignable<Tp>::value,
|
||||
"public API struct should be trivially move assignable");
|
||||
static_assert(std::is_trivially_move_assignable<Tp>::value,
|
||||
"public API struct should be trivially move assignable");
|
||||
static_assert(std::is_trivially_copyable<Tp>::value,
|
||||
"public API struct should be trivially move assignable");
|
||||
static_assert(std::is_trivial<Tp>::value, "public API struct should be trivial");
|
||||
static_assert(offsetof(Tp, size) == 0, "public API struct should have a size field first");
|
||||
static_assert(sizeof(std::declval<Tp>().size) == sizeof(uint64_t),
|
||||
"public API struct size field should be 64 bits");
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
decltype(auto)
|
||||
init_public_api_struct(Tp&& val)
|
||||
{
|
||||
assert_public_api_struct_properties<Tp>();
|
||||
|
||||
::memset(&val, 0, sizeof(Tp));
|
||||
val.size = sizeof(Tp);
|
||||
return std::forward<Tp>(val);
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
Tp&
|
||||
init_public_api_struct(Tp& val)
|
||||
{
|
||||
assert_public_api_struct_properties<Tp>();
|
||||
|
||||
::memset(&val, 0, sizeof(Tp));
|
||||
val.size = sizeof(Tp);
|
||||
return val;
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple wrapper that will call a function when the
|
||||
* wrapper is being destroyed. This is primarily useful
|
||||
* for static variables where we want to run some destruction
|
||||
* operations when the program exits.
|
||||
*/
|
||||
template <typename T, typename L>
|
||||
template <typename Tp>
|
||||
class static_cleanup_wrapper
|
||||
{
|
||||
public:
|
||||
static_cleanup_wrapper(T&& data, L&& destroy_func)
|
||||
: _data(std::move(data))
|
||||
, _destroy_func(destroy_func)
|
||||
using data_type = Tp;
|
||||
using functor_type = std::function<void(Tp&)>;
|
||||
|
||||
static_cleanup_wrapper(data_type&& data, functor_type&& destroy_func)
|
||||
: m_data(std::move(data))
|
||||
, m_destroy_func(std::move(destroy_func))
|
||||
{}
|
||||
|
||||
static_cleanup_wrapper(L&& destroy_func)
|
||||
: _destroy_func(destroy_func)
|
||||
static_cleanup_wrapper(functor_type&& destroy_func)
|
||||
: m_destroy_func(std::move(destroy_func))
|
||||
{}
|
||||
|
||||
~static_cleanup_wrapper() { _destroy_func(_data); }
|
||||
~static_cleanup_wrapper() { m_destroy_func(m_data); }
|
||||
|
||||
void destroy() { _destroy_func(_data); }
|
||||
void destroy() { m_destroy_func(m_data); }
|
||||
|
||||
T& get() { return _data; }
|
||||
data_type& get() { return m_data; }
|
||||
const data_type& get() const { return m_data; }
|
||||
|
||||
private:
|
||||
T _data;
|
||||
L _destroy_func;
|
||||
data_type m_data = {};
|
||||
functor_type m_destroy_func = {};
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -107,41 +216,10 @@ private:
|
||||
class active_capacity_gate
|
||||
{
|
||||
public:
|
||||
active_capacity_gate(size_t capacity)
|
||||
: _capacity(capacity)
|
||||
{}
|
||||
void add_active(size_t size)
|
||||
{
|
||||
if(size >= _capacity)
|
||||
{
|
||||
throw std::runtime_error("Size exceeds gate capacity");
|
||||
}
|
||||
active_capacity_gate(size_t capacity);
|
||||
|
||||
std::unique_lock lock(_m);
|
||||
if(_count + size < _capacity)
|
||||
{
|
||||
_count += size;
|
||||
return;
|
||||
}
|
||||
_waiters++;
|
||||
_cv.wait(lock, [&]() { return _count + size < _capacity; });
|
||||
_waiters--;
|
||||
_count += size;
|
||||
}
|
||||
|
||||
void remove_active(size_t size)
|
||||
{
|
||||
std::unique_lock lock(_m);
|
||||
if(_count > size)
|
||||
_count -= size;
|
||||
else
|
||||
_count = 0;
|
||||
|
||||
if(_waiters > 0)
|
||||
{
|
||||
_cv.notify_all();
|
||||
}
|
||||
}
|
||||
void add_active(size_t size);
|
||||
void remove_active(size_t size);
|
||||
|
||||
private:
|
||||
size_t _count{0};
|
||||
@@ -151,5 +229,44 @@ private:
|
||||
std::condition_variable _cv;
|
||||
};
|
||||
|
||||
inline active_capacity_gate::active_capacity_gate(size_t capacity)
|
||||
: _capacity(capacity)
|
||||
{}
|
||||
|
||||
inline void
|
||||
active_capacity_gate::add_active(size_t size)
|
||||
{
|
||||
if(size >= _capacity)
|
||||
{
|
||||
throw std::runtime_error("Size exceeds gate capacity");
|
||||
}
|
||||
|
||||
std::unique_lock lock(_m);
|
||||
if(_count + size < _capacity)
|
||||
{
|
||||
_count += size;
|
||||
return;
|
||||
}
|
||||
_waiters++;
|
||||
_cv.wait(lock, [&]() { return _count + size < _capacity; });
|
||||
_waiters--;
|
||||
_count += size;
|
||||
}
|
||||
|
||||
inline void
|
||||
active_capacity_gate::remove_active(size_t size)
|
||||
{
|
||||
std::unique_lock lock(_m);
|
||||
if(_count > size)
|
||||
_count -= size;
|
||||
else
|
||||
_count = 0;
|
||||
|
||||
if(_waiters > 0)
|
||||
{
|
||||
_cv.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -786,7 +786,7 @@ get_rocprofiler_agent(hsa_agent_t agent)
|
||||
{
|
||||
for(const auto& itr : get_agent_caches())
|
||||
{
|
||||
if(itr == agent) return &itr.get_rocp_agent();
|
||||
if(itr == agent) return itr.get_rocp_agent();
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
|
||||
@@ -143,7 +143,7 @@ flush(rocprofiler_buffer_id_t buffer_id, bool wait)
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG(ERROR) << "buffer at " << buffer_id.handle << " is empty...";
|
||||
LOG(INFO) << "buffer at " << buffer_id.handle << " is empty...";
|
||||
}
|
||||
|
||||
buff_v->syncer.clear();
|
||||
|
||||
@@ -83,10 +83,10 @@ rocprofiler_context_is_active(rocprofiler_context_id_t context_id, int* status)
|
||||
if(context_id.handle == rocprofiler_context_none.handle)
|
||||
return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND;
|
||||
|
||||
for(const auto& itr : rocprofiler::context::get_active_contexts())
|
||||
auto ctxs = std::vector<const rocprofiler::context::context*>{};
|
||||
for(const auto* itr : rocprofiler::context::get_active_contexts(ctxs))
|
||||
{
|
||||
const auto* cfg = itr.load(std::memory_order_relaxed);
|
||||
if(cfg && cfg->context_idx == context_id.handle)
|
||||
if(itr && itr->context_idx == context_id.handle)
|
||||
{
|
||||
*status = 1;
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
|
||||
@@ -20,10 +20,13 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <rocprofiler/buffer_tracing.h>
|
||||
#include <rocprofiler/fwd.h>
|
||||
#include <rocprofiler/rocprofiler.h>
|
||||
|
||||
#include "lib/common/container/stable_vector.hpp"
|
||||
#include "lib/common/synchronized.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler/buffer.hpp"
|
||||
#include "lib/rocprofiler/context/context.hpp"
|
||||
#include "lib/rocprofiler/counters/core.hpp"
|
||||
@@ -33,6 +36,7 @@
|
||||
#include <unistd.h>
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <deque>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
@@ -43,6 +47,8 @@ namespace context
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using reserve_size_t = common::container::reserve_size;
|
||||
|
||||
auto&
|
||||
get_contexts_mutex()
|
||||
{
|
||||
@@ -58,26 +64,16 @@ get_client_index()
|
||||
static auto _v = invalid_client_idx;
|
||||
return _v;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
uint64_t
|
||||
correlation_tracing_service::get_unique_internal_id()
|
||||
auto&
|
||||
get_num_active_contexts()
|
||||
{
|
||||
static auto _v = std::atomic<uint64_t>{};
|
||||
return _v++;
|
||||
}
|
||||
|
||||
using reserve_size_t = common::container::reserve_size;
|
||||
|
||||
unique_context_vec_t&
|
||||
get_registered_contexts()
|
||||
{
|
||||
static auto _v = unique_context_vec_t{reserve_size_t{unique_context_vec_t::chunk_size}};
|
||||
static auto _v = std::atomic<int64_t>{0};
|
||||
return _v;
|
||||
}
|
||||
|
||||
active_context_vec_t&
|
||||
get_active_contexts()
|
||||
get_active_contexts_impl()
|
||||
{
|
||||
static auto* _v = new active_context_vec_t{reserve_size_t{active_context_vec_t::chunk_size}};
|
||||
static auto _once = std::once_flag{};
|
||||
@@ -90,6 +86,97 @@ get_active_contexts()
|
||||
return *_v;
|
||||
}
|
||||
|
||||
auto&
|
||||
get_correlation_id_map()
|
||||
{
|
||||
static auto _v = common::Synchronized<std::vector<std::unique_ptr<correlation_id>>>{};
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto*&
|
||||
get_latest_correlation_id_impl()
|
||||
{
|
||||
static thread_local correlation_id* _v = nullptr;
|
||||
return _v;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
get_unique_internal_id()
|
||||
{
|
||||
static auto _v = std::atomic<uint64_t>{};
|
||||
return ++_v;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
correlation_id*
|
||||
correlation_tracing_service::construct(uint32_t _init_ref_count)
|
||||
{
|
||||
auto _internal_id = get_unique_internal_id();
|
||||
auto& corr_id_map = get_correlation_id_map();
|
||||
auto& ret = corr_id_map.wlock([](auto& data) -> auto& { return data.emplace_back(); });
|
||||
ret = std::make_unique<correlation_id>(_init_ref_count, common::get_tid(), _internal_id);
|
||||
|
||||
get_latest_correlation_id_impl() = ret.get();
|
||||
|
||||
return ret.get();
|
||||
}
|
||||
|
||||
correlation_id*
|
||||
get_latest_correlation_id()
|
||||
{
|
||||
return get_latest_correlation_id_impl();
|
||||
}
|
||||
|
||||
void
|
||||
pop_latest_correlation_id(const correlation_id* val)
|
||||
{
|
||||
if(get_latest_correlation_id_impl() == val) get_latest_correlation_id_impl() = nullptr;
|
||||
}
|
||||
|
||||
unique_context_vec_t&
|
||||
get_registered_contexts()
|
||||
{
|
||||
static auto _v = unique_context_vec_t{reserve_size_t{unique_context_vec_t::chunk_size}};
|
||||
return _v;
|
||||
}
|
||||
|
||||
std::vector<const context*>&
|
||||
get_active_contexts(std::vector<const context*>& data, context_filter_t filter)
|
||||
{
|
||||
data.clear();
|
||||
auto num_ctx = get_num_active_contexts().load(std::memory_order_acquire);
|
||||
if(num_ctx <= 0) return data;
|
||||
|
||||
data.reserve(num_ctx);
|
||||
for(auto& itr : get_active_contexts_impl())
|
||||
{
|
||||
const auto* ctx = itr.load(std::memory_order_acquire);
|
||||
if(ctx)
|
||||
{
|
||||
if(!filter || (filter && filter(ctx))) data.emplace_back(ctx);
|
||||
}
|
||||
if(static_cast<int64_t>(data.size()) == num_ctx)
|
||||
{
|
||||
// if the number of active contexts changed, restart
|
||||
if(num_ctx != get_num_active_contexts().load(std::memory_order_relaxed))
|
||||
{
|
||||
data.clear();
|
||||
return get_active_contexts(data, filter);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
std::vector<const context*>
|
||||
get_active_contexts(context_filter_t filter)
|
||||
{
|
||||
auto data = std::vector<const context*>{};
|
||||
get_active_contexts(data, filter);
|
||||
return data;
|
||||
}
|
||||
|
||||
// set the client index needs to be called before allocate_context()
|
||||
void
|
||||
push_client(uint32_t value)
|
||||
@@ -173,15 +260,30 @@ start_context(rocprofiler_context_id_t context_id)
|
||||
return ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID;
|
||||
}
|
||||
|
||||
auto current_contexts = std::vector<const context*>{};
|
||||
for(const auto* itr : get_active_contexts(current_contexts))
|
||||
{
|
||||
if(cfg->context_idx == itr->context_idx)
|
||||
{
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
else if(cfg->counter_collection && itr->counter_collection)
|
||||
{
|
||||
// conflicting context
|
||||
return ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t rocp_tot_contexts = get_registered_contexts().size();
|
||||
auto idx = rocp_tot_contexts;
|
||||
auto& active_contexts = get_active_contexts_impl();
|
||||
{
|
||||
// hold a lock here so prevent multiple threads from finding the same nullptr slot
|
||||
// hold a lock here to prevent multiple threads from finding the same nullptr slot
|
||||
auto _lk = std::unique_lock<std::mutex>{get_contexts_mutex()};
|
||||
// try to find a nullptr slot first
|
||||
for(size_t i = 0; i < get_active_contexts().size(); ++i)
|
||||
for(size_t i = 0; i < active_contexts.size(); ++i)
|
||||
{
|
||||
const auto* itr = get_active_contexts().at(i).load(std::memory_order_relaxed);
|
||||
const auto* itr = active_contexts.at(i).load(std::memory_order_relaxed);
|
||||
if(itr == nullptr)
|
||||
{
|
||||
idx = i;
|
||||
@@ -195,19 +297,25 @@ start_context(rocprofiler_context_id_t context_id)
|
||||
// if no nullptr slot was found, then create one while lock is held
|
||||
if(idx == rocp_tot_contexts)
|
||||
{
|
||||
idx = get_active_contexts().size();
|
||||
get_active_contexts().emplace_back();
|
||||
idx = active_contexts.size();
|
||||
active_contexts.emplace_back();
|
||||
}
|
||||
|
||||
get_num_active_contexts().fetch_add(1, std::memory_order_release);
|
||||
}
|
||||
|
||||
// atomic swap the pointer into the "active" array used internally
|
||||
const context* _expected = nullptr;
|
||||
bool success = get_active_contexts().at(idx).compare_exchange_strong(
|
||||
bool success = active_contexts.at(idx).compare_exchange_strong(
|
||||
_expected, get_registered_contexts().at(context_id.handle).get());
|
||||
|
||||
if(!success) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
|
||||
if(!success)
|
||||
{
|
||||
get_num_active_contexts().fetch_sub(1, std::memory_order_release);
|
||||
return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
|
||||
}
|
||||
|
||||
rocprofiler::counters::start_context(context_id);
|
||||
if(cfg->counter_collection) rocprofiler::counters::start_context(cfg);
|
||||
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -215,17 +323,27 @@ start_context(rocprofiler_context_id_t context_id)
|
||||
rocprofiler_status_t
|
||||
stop_context(rocprofiler_context_id_t idx)
|
||||
{
|
||||
// hold a lock here to prevent other thread from changing the active contexts array
|
||||
auto _lk = std::unique_lock<std::mutex>{get_contexts_mutex()};
|
||||
|
||||
// atomically assign the context pointer to NULL so that it is skipped in future
|
||||
// callbacks
|
||||
for(auto& itr : get_active_contexts())
|
||||
for(auto& itr : get_active_contexts_impl())
|
||||
{
|
||||
const auto* _expected = itr.load(std::memory_order_relaxed);
|
||||
const context* _expected = itr.load(std::memory_order_acquire);
|
||||
if(_expected && _expected->context_idx == idx.handle)
|
||||
{
|
||||
bool success = itr.compare_exchange_strong(_expected, nullptr);
|
||||
|
||||
rocprofiler::counters::stop_context(idx);
|
||||
if(success) return ROCPROFILER_STATUS_SUCCESS;
|
||||
if(success)
|
||||
{
|
||||
auto nactive = get_num_active_contexts().load(std::memory_order_acquire);
|
||||
if(nactive > 0) get_num_active_contexts().fetch_sub(1, std::memory_order_release);
|
||||
|
||||
if(_expected->counter_collection)
|
||||
rocprofiler::counters::stop_context(const_cast<context*>(_expected));
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,10 +353,10 @@ stop_context(rocprofiler_context_id_t idx)
|
||||
void
|
||||
deactivate_client_contexts(rocprofiler_client_id_t client_id)
|
||||
{
|
||||
for(auto& itr : get_active_contexts())
|
||||
for(auto& itr : get_active_contexts_impl())
|
||||
{
|
||||
const auto* itr_v = itr.load();
|
||||
if(itr_v->client_idx == client_id.handle)
|
||||
if(itr_v && itr_v->client_idx == client_id.handle)
|
||||
{
|
||||
itr.store(nullptr);
|
||||
}
|
||||
|
||||
@@ -46,12 +46,53 @@ using external_cid_cb_t = uint64_t (*)(rocprofiler_service_callback_tracing_kind
|
||||
uint32_t,
|
||||
uint64_t);
|
||||
|
||||
constexpr auto null_user_data = rocprofiler_user_data_t{.value = 0};
|
||||
struct correlation_id
|
||||
{
|
||||
// reference count starts at 5:
|
||||
// - decrement after begin callback/buffer API
|
||||
// - decrement after end callback/buffer API
|
||||
// - decrement after kernel dispatch/HW counters
|
||||
// - if PC sampling is not enabled, we can "retire" correlation id at ref count at 2
|
||||
// - if PC sampling is enabled, we decrement after each HSA buffer flush once ref count hits 2
|
||||
// - after the kernel dispatch completes, we know no more PC samples will be generated and
|
||||
// thus, after two HSA buffer flushes, we will have received all the PC samples for
|
||||
// the
|
||||
correlation_id(uint32_t _cnt, rocprofiler_thread_id_t _tid, uint64_t _internal) noexcept
|
||||
: ref_count{_cnt}
|
||||
, thread_idx{_tid}
|
||||
, internal{_internal}
|
||||
{}
|
||||
|
||||
correlation_id() = default;
|
||||
~correlation_id() = default;
|
||||
correlation_id(correlation_id&& val) noexcept = delete;
|
||||
correlation_id(const correlation_id&) = delete;
|
||||
|
||||
correlation_id& operator=(const correlation_id&) = delete;
|
||||
correlation_id& operator=(correlation_id&&) noexcept = delete;
|
||||
|
||||
std::atomic<uint32_t> ref_count = {};
|
||||
rocprofiler_thread_id_t thread_idx = 0;
|
||||
uint64_t internal = 0;
|
||||
};
|
||||
|
||||
correlation_id*
|
||||
get_correlation_id(rocprofiler_thread_id_t tid, uint64_t internal_id);
|
||||
|
||||
// latest correlation id for thread
|
||||
correlation_id*
|
||||
get_latest_correlation_id();
|
||||
|
||||
void
|
||||
pop_latest_correlation_id(const correlation_id*);
|
||||
|
||||
/// permits tools opportunity to modify the correlation id based on the domain, op, and
|
||||
/// the rocprofiler generated correlation id
|
||||
struct correlation_tracing_service
|
||||
{
|
||||
external_correlation::external_correlation external_correlator = {};
|
||||
static uint64_t get_unique_internal_id();
|
||||
static correlation_id* construct(uint32_t init_ref_count);
|
||||
};
|
||||
|
||||
struct callback_tracing_service
|
||||
@@ -139,8 +180,20 @@ using active_context_vec_t = common::container::stable_vector<std::atomic<const
|
||||
unique_context_vec_t&
|
||||
get_registered_contexts();
|
||||
|
||||
active_context_vec_t&
|
||||
get_active_contexts();
|
||||
using context_filter_t = bool (*)(const context*);
|
||||
|
||||
inline bool
|
||||
default_context_filter(const context* val)
|
||||
{
|
||||
return (val != nullptr);
|
||||
}
|
||||
|
||||
std::vector<const context*>&
|
||||
get_active_contexts(std::vector<const context*>& data,
|
||||
context_filter_t filter = default_context_filter);
|
||||
|
||||
std::vector<const context*>
|
||||
get_active_contexts(context_filter_t filter = default_context_filter);
|
||||
|
||||
void deactivate_client_contexts(rocprofiler_client_id_t);
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ std::unique_ptr<rocprofiler::hsa::AQLPacket>
|
||||
queue_cb(const std::shared_ptr<rocprofiler::counters::counter_callback_info>& info,
|
||||
const hsa::Queue& queue,
|
||||
hsa::ClientID,
|
||||
const hsa_ext_amd_aql_pm4_packet_t&)
|
||||
hsa::rocprofiler_packet)
|
||||
{
|
||||
if(!info) return nullptr;
|
||||
|
||||
@@ -95,7 +95,7 @@ void
|
||||
completed_cb(const std::shared_ptr<rocprofiler::counters::counter_callback_info>& info,
|
||||
const hsa::Queue& queue,
|
||||
hsa::ClientID,
|
||||
const hsa_ext_amd_aql_pm4_packet_t& kernel,
|
||||
hsa::rocprofiler_packet kernel,
|
||||
std::unique_ptr<rocprofiler::hsa::AQLPacket> pkt)
|
||||
{
|
||||
if(!info) return;
|
||||
@@ -127,7 +127,7 @@ completed_cb(const std::shared_ptr<rocprofiler::counters::counter_callback_info>
|
||||
info->user_cb(queue.get_id(),
|
||||
info->profile_cfg.agent,
|
||||
rocprofiler_correlation_id_t{},
|
||||
reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(&kernel),
|
||||
&kernel.kernel_dispatch,
|
||||
info->callback_args,
|
||||
out.data(), // Date pointer does here.
|
||||
out.size(), // Number of objects
|
||||
@@ -215,31 +215,29 @@ destroy_counter_profile(uint64_t id)
|
||||
}
|
||||
|
||||
void
|
||||
start_context(rocprofiler_context_id_t context_id)
|
||||
start_context(context::context* ctx)
|
||||
{
|
||||
auto& ctx = *rocprofiler::context::get_registered_contexts().at(context_id.handle);
|
||||
if(!ctx || !ctx->counter_collection) return;
|
||||
|
||||
auto& controller = hsa::get_queue_controller();
|
||||
if(!ctx.counter_collection) return;
|
||||
|
||||
// Only one thread should be attempting to enable/disable this context
|
||||
ctx.counter_collection->enabled.wlock([&](auto& enabled) {
|
||||
ctx->counter_collection->enabled.wlock([&](auto& enabled) {
|
||||
if(enabled) return;
|
||||
for(auto& cb : ctx.counter_collection->callbacks)
|
||||
for(auto& cb : ctx->counter_collection->callbacks)
|
||||
{
|
||||
// Insert our callbacks into HSA Interceptor. This
|
||||
// turns on counter instrumentation.
|
||||
cb->queue_id = controller.add_callback(
|
||||
cb->profile_cfg.agent,
|
||||
[=](const hsa::Queue& q,
|
||||
hsa::ClientID c,
|
||||
const hsa_ext_amd_aql_pm4_packet_t& kern_pkt) {
|
||||
[=](const hsa::Queue& q, hsa::ClientID c, hsa::rocprofiler_packet kern_pkt) {
|
||||
return queue_cb(cb, q, c, kern_pkt);
|
||||
},
|
||||
// Completion CB
|
||||
[=](const hsa::Queue& q,
|
||||
hsa::ClientID c,
|
||||
const hsa_ext_amd_aql_pm4_packet_t& kern_pkt,
|
||||
std::unique_ptr<hsa::AQLPacket> aql) {
|
||||
[=](const hsa::Queue& q,
|
||||
hsa::ClientID c,
|
||||
hsa::rocprofiler_packet kern_pkt,
|
||||
std::unique_ptr<hsa::AQLPacket> aql) {
|
||||
completed_cb(cb, q, c, kern_pkt, std::move(aql));
|
||||
});
|
||||
}
|
||||
@@ -248,15 +246,15 @@ start_context(rocprofiler_context_id_t context_id)
|
||||
}
|
||||
|
||||
void
|
||||
stop_context(rocprofiler_context_id_t context_id)
|
||||
stop_context(context::context* ctx)
|
||||
{
|
||||
auto& controller = hsa::get_queue_controller();
|
||||
auto& ctx = *rocprofiler::context::get_registered_contexts().at(context_id.handle);
|
||||
if(!ctx.counter_collection) return;
|
||||
if(!ctx || !ctx->counter_collection) return;
|
||||
|
||||
ctx.counter_collection->enabled.wlock([&](auto& enabled) {
|
||||
auto& controller = hsa::get_queue_controller();
|
||||
|
||||
ctx->counter_collection->enabled.wlock([&](auto& enabled) {
|
||||
if(!enabled) return;
|
||||
for(auto& cb : ctx.counter_collection->callbacks)
|
||||
for(auto& cb : ctx->counter_collection->callbacks)
|
||||
{
|
||||
// Remove our callbacks from HSA's queue controller
|
||||
controller.remove_callback(cb->queue_id);
|
||||
|
||||
@@ -11,6 +11,10 @@
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace context
|
||||
{
|
||||
struct context;
|
||||
}
|
||||
namespace counters
|
||||
{
|
||||
// Stores counter profiling information such as the agent
|
||||
@@ -55,15 +59,21 @@ struct counter_callback_info
|
||||
};
|
||||
|
||||
uint64_t
|
||||
create_counter_profile(profile_config&& config);
|
||||
void destroy_counter_profile(uint64_t);
|
||||
bool
|
||||
configure_dispatch(rocprofiler_context_id_t context_id,
|
||||
uint64_t profile_id,
|
||||
rocprofiler_profile_counting_dispatch_callback_t callback,
|
||||
void* callback_args);
|
||||
void start_context(rocprofiler_context_id_t);
|
||||
create_counter_profile(profile_config&& config);
|
||||
|
||||
void stop_context(rocprofiler_context_id_t);
|
||||
void
|
||||
destroy_counter_profile(uint64_t id);
|
||||
|
||||
bool
|
||||
configure_dispatch(rocprofiler_context_id_t context_id,
|
||||
uint64_t profile_id,
|
||||
rocprofiler_profile_counting_dispatch_callback_t callback,
|
||||
void* callback_args);
|
||||
|
||||
void
|
||||
start_context(context::context*);
|
||||
|
||||
void
|
||||
stop_context(context::context*);
|
||||
} // namespace counters
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -84,16 +84,16 @@ getBlockDimensions(const std::string& agent, const Metric& metric)
|
||||
{
|
||||
return std::vector<MetricDimension>{
|
||||
{dimension_map().at(ROCPROFILER_DIMENSION_SHADER_ENGINE),
|
||||
maybe_agent.get_rocp_agent().num_shader_banks,
|
||||
maybe_agent.get_rocp_agent()->num_shader_banks,
|
||||
ROCPROFILER_DIMENSION_SHADER_ENGINE},
|
||||
{dimension_map().at(ROCPROFILER_DIMENSION_XCC),
|
||||
maybe_agent.get_rocp_agent().num_xcc,
|
||||
maybe_agent.get_rocp_agent()->num_xcc,
|
||||
ROCPROFILER_DIMENSION_XCC},
|
||||
{dimension_map().at(ROCPROFILER_DIMENSION_CU),
|
||||
maybe_agent.get_rocp_agent().cu_count,
|
||||
maybe_agent.get_rocp_agent()->cu_count,
|
||||
ROCPROFILER_DIMENSION_CU},
|
||||
{dimension_map().at(ROCPROFILER_DIMENSION_AGENT),
|
||||
maybe_agent.get_rocp_agent().id.handle,
|
||||
maybe_agent.get_rocp_agent()->id.handle,
|
||||
ROCPROFILER_DIMENSION_AGENT}};
|
||||
|
||||
// auto query_info = aql::get_query_info(maybe_agent.get_agent(), metric);
|
||||
@@ -106,4 +106,4 @@ getBlockDimensions(const std::string& agent, const Metric& metric)
|
||||
}
|
||||
|
||||
} // namespace counters
|
||||
} // namespace rocprofiler
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -72,7 +72,7 @@ public:
|
||||
CONST_NONCONST_ACCESSOR(hsa_agent_t, get_hsa_agent, m_hsa_agent);
|
||||
CONST_NONCONST_ACCESSOR(hsa_agent_t, near_cpu, m_nearest_cpu);
|
||||
|
||||
const rocprofiler_agent_t& get_rocp_agent() const { return *m_rocp_agent; }
|
||||
const rocprofiler_agent_t* get_rocp_agent() const { return m_rocp_agent; }
|
||||
std::string_view name() const { return m_name; }
|
||||
size_t index() const { return m_index; }
|
||||
|
||||
|
||||
@@ -175,8 +175,6 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
{
|
||||
using info_type = hsa_api_info<Idx>;
|
||||
|
||||
LOG(INFO) << __PRETTY_FUNCTION__;
|
||||
|
||||
struct callback_context_data
|
||||
{
|
||||
const context::context* ctx = nullptr;
|
||||
@@ -190,14 +188,17 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
rocprofiler_user_data_t external_correlation = {};
|
||||
};
|
||||
|
||||
auto thr_id = common::get_tid();
|
||||
auto callback_contexts = std::vector<callback_context_data>{};
|
||||
auto buffered_contexts = std::vector<buffered_context_data>{};
|
||||
for(const auto& aitr : context::get_active_contexts())
|
||||
static thread_local auto active_contexts = std::vector<const context::context*>{};
|
||||
auto thr_id = common::get_tid();
|
||||
auto callback_contexts = std::vector<callback_context_data>{};
|
||||
auto buffered_contexts = std::vector<buffered_context_data>{};
|
||||
auto has_pc_sampling = false;
|
||||
for(const auto* itr : context::get_active_contexts(active_contexts))
|
||||
{
|
||||
const auto* itr = aitr.load();
|
||||
if(!itr) continue;
|
||||
|
||||
// if(itr->pc_sampler) has_pc_sampling = true;
|
||||
|
||||
if(itr->callback_tracer)
|
||||
{
|
||||
// if the given domain + op is not enabled, skip this context
|
||||
@@ -226,10 +227,16 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0};
|
||||
auto buffer_record = rocprofiler_buffer_tracing_hsa_api_record_t{};
|
||||
auto tracer_data = rocprofiler_callback_tracing_hsa_api_data_t{};
|
||||
auto internal_corr_id = context::correlation_tracing_service::get_unique_internal_id();
|
||||
using correlation_service = context::correlation_tracing_service;
|
||||
using buffer_hsa_api_record_t = rocprofiler_buffer_tracing_hsa_api_record_t;
|
||||
using callback_hsa_api_data_t = rocprofiler_callback_tracing_hsa_api_data_t;
|
||||
|
||||
constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0};
|
||||
auto ref_count = (has_pc_sampling) ? 4 : 2;
|
||||
auto buffer_record = common::init_public_api_struct(buffer_hsa_api_record_t{});
|
||||
auto tracer_data = common::init_public_api_struct(callback_hsa_api_data_t{});
|
||||
auto* corr_id = correlation_service::construct(ref_count);
|
||||
auto internal_corr_id = corr_id->internal;
|
||||
|
||||
// construct the buffered info before the callback so the callbacks are as closely wrapped
|
||||
// around the function call as possible
|
||||
@@ -246,7 +253,6 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
// invoke the callbacks
|
||||
if(!callback_contexts.empty())
|
||||
{
|
||||
tracer_data.size = sizeof(rocprofiler_callback_tracing_hsa_api_data_t);
|
||||
set_data_args(info_type::get_api_data_args(tracer_data.args), std::forward<Args>(args)...);
|
||||
|
||||
for(auto& itr : callback_contexts)
|
||||
@@ -255,12 +261,13 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
auto& record = itr.record;
|
||||
auto& user_data = itr.user_data;
|
||||
|
||||
auto corr_id = rocprofiler_correlation_id_t{
|
||||
internal_corr_id, ctx->correlation_tracer.external_correlator.get(thr_id)};
|
||||
auto extern_corr_id_v = ctx->correlation_tracer.external_correlator.get(thr_id);
|
||||
|
||||
auto corr_id_v = rocprofiler_correlation_id_t{internal_corr_id, extern_corr_id_v};
|
||||
record =
|
||||
rocprofiler_callback_tracing_record_t{rocprofiler_context_id_t{ctx->context_idx},
|
||||
thr_id,
|
||||
corr_id,
|
||||
corr_id_v,
|
||||
info_type::callback_domain_idx,
|
||||
info_type::operation_idx,
|
||||
ROCPROFILER_CALLBACK_PHASE_ENTER,
|
||||
@@ -287,6 +294,9 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
buffer_record.start_timestamp = common::timestamp_ns();
|
||||
}
|
||||
|
||||
// decrement the reference count before invoking
|
||||
corr_id->ref_count.fetch_sub(1);
|
||||
|
||||
auto _ret = exec(info_type::get_table_func(), std::forward<Args>(args)...);
|
||||
|
||||
// record the end timestamp as close to the function call as possible
|
||||
@@ -333,13 +343,18 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
|
||||
bitr->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING,
|
||||
info_type::buffered_domain_idx,
|
||||
buffer_record);
|
||||
record_v);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decrement the reference count after usage in the callback/buffers
|
||||
corr_id->ref_count.fetch_sub(1);
|
||||
|
||||
context::pop_latest_correlation_id(corr_id);
|
||||
|
||||
if constexpr(!std::is_same<decltype(_ret), null_type>::value)
|
||||
return _ret;
|
||||
else
|
||||
@@ -348,6 +363,8 @@ hsa_api_impl<Idx>::functor(Args&&... args)
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
#define ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL 1
|
||||
|
||||
// template specializations
|
||||
#include "hsa.def.cpp"
|
||||
|
||||
|
||||
@@ -18,10 +18,13 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include <rocprofiler/hsa.h>
|
||||
#if defined(ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL) && \
|
||||
ROCPROFILER_LIB_ROCPROFILER_HSA_HSA_CPP_IMPL == 1
|
||||
|
||||
#include "defines.hpp"
|
||||
#include "hsa.hpp"
|
||||
# include <rocprofiler/hsa.h>
|
||||
|
||||
# include "defines.hpp"
|
||||
# include "hsa.hpp"
|
||||
|
||||
// clang-format off
|
||||
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_API_TABLE_ID_CoreApi, core_)
|
||||
@@ -223,7 +226,7 @@ HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, ROCPROFILER_HSA_A
|
||||
HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt, ROCPROFILER_HSA_API_ID_hsa_amd_runtime_queue_create_register, hsa_amd_runtime_queue_create_register, hsa_amd_runtime_queue_create_register_fn, callback, user_data)
|
||||
// clang-format on
|
||||
|
||||
#if HSA_AMD_EXT_API_TABLE_MAJOR_VERSION >= 0x02
|
||||
# if HSA_AMD_EXT_API_TABLE_MAJOR_VERSION >= 0x02
|
||||
HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt,
|
||||
ROCPROFILER_HSA_API_ID_hsa_amd_vmem_address_reserve,
|
||||
hsa_amd_vmem_address_reserve,
|
||||
@@ -308,4 +311,8 @@ HSA_API_INFO_DEFINITION_V(ROCPROFILER_HSA_API_TABLE_ID_AmdExt,
|
||||
alloc_handle,
|
||||
pool,
|
||||
type)
|
||||
# endif
|
||||
|
||||
#else
|
||||
# error "Do not compile this file directly. It is included by lib/rocprofiler/hsa/hsa.cpp"
|
||||
#endif
|
||||
|
||||
@@ -19,8 +19,15 @@
|
||||
THE SOFTWARE. */
|
||||
|
||||
#include "lib/rocprofiler/hsa/queue.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler/buffer.hpp"
|
||||
#include "lib/rocprofiler/context/context.hpp"
|
||||
|
||||
#include <glog/logging.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <rocprofiler/fwd.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
@@ -76,11 +83,11 @@ AsyncSignalHandler(hsa_signal_value_t, void* data)
|
||||
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
|
||||
queue_info_session.interrupt_signal);
|
||||
}
|
||||
if(queue_info_session.kernel_pkt.completion_signal.handle != 0u)
|
||||
if(queue_info_session.kernel_pkt.ext_amd_aql_pm4.completion_signal.handle != 0u)
|
||||
{
|
||||
signals_to_remove++;
|
||||
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
|
||||
queue_info_session.kernel_pkt.completion_signal);
|
||||
queue_info_session.kernel_pkt.ext_amd_aql_pm4.completion_signal);
|
||||
}
|
||||
if(signals_to_remove > 0)
|
||||
{
|
||||
@@ -92,25 +99,6 @@ AsyncSignalHandler(hsa_signal_value_t, void* data)
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
CreateBarrierPacket(const hsa_signal_t& packet_completion_signal,
|
||||
std::vector<hsa_ext_amd_aql_pm4_packet_t>& transformed_packets)
|
||||
{
|
||||
hsa_barrier_and_packet_t barrier{};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
barrier.dep_signal[0] = packet_completion_signal;
|
||||
void* barrier_ptr = &barrier;
|
||||
transformed_packets.emplace_back(*reinterpret_cast<hsa_ext_amd_aql_pm4_packet_t*>(barrier_ptr));
|
||||
}
|
||||
|
||||
void
|
||||
AddVendorSpecificPacket(const hsa_ext_amd_aql_pm4_packet_t& packet,
|
||||
std::vector<hsa_ext_amd_aql_pm4_packet_t>& transformed_packets,
|
||||
const hsa_signal_t& packet_completion_signal)
|
||||
{
|
||||
transformed_packets.emplace_back(packet).completion_signal = packet_completion_signal;
|
||||
}
|
||||
|
||||
template <typename Integral = uint64_t>
|
||||
constexpr Integral
|
||||
bit_mask(int first, int last)
|
||||
@@ -145,48 +133,68 @@ WriteInterceptor(const void* packets,
|
||||
void* data,
|
||||
hsa_amd_queue_intercept_packet_writer writer)
|
||||
{
|
||||
Queue& queue_info = *static_cast<Queue*>(data);
|
||||
auto&& AddVendorSpecificPacket = [](hsa_ext_amd_aql_pm4_packet_t _packet,
|
||||
hsa_signal_t _signal,
|
||||
std::vector<rocprofiler_packet>& _packets) {
|
||||
_packets.emplace_back(_packet).ext_amd_aql_pm4.completion_signal = _signal;
|
||||
};
|
||||
|
||||
auto&& CreateBarrierPacket = [](hsa_signal_t _signal,
|
||||
std::vector<rocprofiler_packet>& _packets) {
|
||||
hsa_barrier_and_packet_t barrier{};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
barrier.dep_signal[0] = _signal;
|
||||
_packets.emplace_back(barrier);
|
||||
};
|
||||
|
||||
LOG_IF(FATAL, data == nullptr) << "WriteInterceptor was not passed a pointer to the queue";
|
||||
|
||||
auto& queue = *static_cast<Queue*>(data);
|
||||
auto thr_id = common::get_tid();
|
||||
auto* corr_id = context::get_latest_correlation_id();
|
||||
// increase the reference count to denote that this correlation id is being used in a kernel
|
||||
if(corr_id) corr_id->ref_count.fetch_add(1);
|
||||
|
||||
// We have no packets or no one who needs to be notified, do nothing.
|
||||
if(pkt_count == 0 || queue_info.get_notifiers() == 0)
|
||||
if(pkt_count == 0 || queue.get_notifiers() == 0)
|
||||
{
|
||||
writer(packets, pkt_count);
|
||||
return;
|
||||
}
|
||||
|
||||
// hsa_ext_amd_aql_pm4_packet_t
|
||||
const hsa_ext_amd_aql_pm4_packet_t* packets_arr =
|
||||
static_cast<const hsa_ext_amd_aql_pm4_packet_t*>(packets);
|
||||
std::vector<hsa_ext_amd_aql_pm4_packet_t> transformed_packets;
|
||||
const auto* packets_arr = static_cast<const rocprofiler_packet*>(packets);
|
||||
auto transformed_packets = std::vector<rocprofiler_packet>{};
|
||||
|
||||
// Searching accross all the packets given during this write
|
||||
for(size_t i = 0; i < pkt_count; ++i)
|
||||
{
|
||||
const auto& original_packet = static_cast<const hsa_barrier_and_packet_t*>(packets)[i];
|
||||
if(bit_extract(original_packet.header,
|
||||
HSA_PACKET_HEADER_TYPE,
|
||||
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1) !=
|
||||
HSA_PACKET_TYPE_KERNEL_DISPATCH)
|
||||
const auto& original_packet = packets_arr[i].kernel_dispatch;
|
||||
auto packet_type = bit_extract(original_packet.header,
|
||||
HSA_PACKET_HEADER_TYPE,
|
||||
HSA_PACKET_HEADER_TYPE + HSA_PACKET_HEADER_WIDTH_TYPE - 1);
|
||||
if(packet_type != HSA_PACKET_TYPE_KERNEL_DISPATCH)
|
||||
{
|
||||
transformed_packets.emplace_back(packets_arr[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Copy kernel pkt, copy is to allow for signal to be modified
|
||||
hsa_ext_amd_aql_pm4_packet_t kernel_pkt = packets_arr[i];
|
||||
queue_info.create_signal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &kernel_pkt.completion_signal);
|
||||
rocprofiler_packet kernel_pkt = packets_arr[i];
|
||||
queue.create_signal(HSA_AMD_SIGNAL_AMD_GPU_ONLY,
|
||||
&kernel_pkt.ext_amd_aql_pm4.completion_signal);
|
||||
|
||||
// Stores the instrumentation pkt (i.e. AQL packets for counter collection)
|
||||
// along with an ID of the client we got the packet from (this will be returned via
|
||||
// CompletedCB)
|
||||
// completed_cb_t)
|
||||
ClientID inst_pkt_id = -1;
|
||||
std::unique_ptr<AQLPacket> inst_pkt;
|
||||
|
||||
// Signal callbacks that a kernel_pkt is being enqueued
|
||||
queue_info.signal_callback([&](const auto& map) {
|
||||
queue.signal_callback([&](const auto& map) {
|
||||
for(const auto& [client_id, cb_pair] : map)
|
||||
{
|
||||
if(auto maybe_pkt = cb_pair.first(queue_info, client_id, kernel_pkt))
|
||||
if(auto maybe_pkt = cb_pair.first(queue, client_id, kernel_pkt))
|
||||
{
|
||||
LOG_IF(FATAL, inst_pkt)
|
||||
<< "We do not support two injections into the HSA queue";
|
||||
@@ -196,14 +204,13 @@ WriteInterceptor(const void* packets,
|
||||
}
|
||||
});
|
||||
|
||||
constexpr auto dummy_signal = hsa_signal_t{.handle = 0};
|
||||
|
||||
// Write instrumentation start packet (if one exists)
|
||||
if(inst_pkt)
|
||||
{
|
||||
hsa_signal_t dummy_signal{};
|
||||
dummy_signal.handle = 0;
|
||||
inst_pkt->start.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(inst_pkt->start, transformed_packets, dummy_signal);
|
||||
|
||||
AddVendorSpecificPacket(inst_pkt->start, dummy_signal, transformed_packets);
|
||||
CreateBarrierPacket(inst_pkt->start.completion_signal, transformed_packets);
|
||||
}
|
||||
|
||||
@@ -214,25 +221,21 @@ WriteInterceptor(const void* packets,
|
||||
if(original_packet.completion_signal.handle != 0u)
|
||||
{
|
||||
hsa_barrier_and_packet_t barrier{};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
hsa_ext_amd_aql_pm4_packet_t* __attribute__((__may_alias__)) pkt =
|
||||
(reinterpret_cast<hsa_ext_amd_aql_pm4_packet_t*>(&barrier));
|
||||
transformed_packets.emplace_back(*pkt).completion_signal =
|
||||
original_packet.completion_signal;
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
barrier.completion_signal = original_packet.completion_signal;
|
||||
transformed_packets.emplace_back(barrier);
|
||||
}
|
||||
|
||||
hsa_signal_t interrupt_signal{};
|
||||
// Adding a barrier packet with the original packet's completion signal.
|
||||
queue_info.create_signal(0, &interrupt_signal);
|
||||
queue.create_signal(0, &interrupt_signal);
|
||||
|
||||
if(inst_pkt)
|
||||
{
|
||||
hsa_signal_t dummy_signal{};
|
||||
dummy_signal.handle = 0;
|
||||
inst_pkt->stop.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(inst_pkt->stop, transformed_packets, dummy_signal);
|
||||
AddVendorSpecificPacket(inst_pkt->stop, dummy_signal, transformed_packets);
|
||||
inst_pkt->read.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
AddVendorSpecificPacket(inst_pkt->read, transformed_packets, interrupt_signal);
|
||||
AddVendorSpecificPacket(inst_pkt->read, interrupt_signal, transformed_packets);
|
||||
|
||||
// Added Interrupt Signal with barrier and provided handler for it
|
||||
CreateBarrierPacket(interrupt_signal, transformed_packets);
|
||||
@@ -242,21 +245,25 @@ WriteInterceptor(const void* packets,
|
||||
hsa_barrier_and_packet_t barrier{};
|
||||
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
barrier.completion_signal = interrupt_signal;
|
||||
hsa_ext_amd_aql_pm4_packet_t* __attribute__((__may_alias__)) pkt =
|
||||
(reinterpret_cast<hsa_ext_amd_aql_pm4_packet_t*>(&barrier));
|
||||
transformed_packets.emplace_back(*pkt);
|
||||
transformed_packets.emplace_back(barrier);
|
||||
}
|
||||
|
||||
// TODO(jrmadsen): fetch kernel identifier from code object loading
|
||||
uint64_t kernel_id = 0;
|
||||
|
||||
// Enqueue the signal into the handler. Will call completed_cb when
|
||||
// signal completes.
|
||||
queue_info.async_started();
|
||||
queue_info.signal_async_handler(
|
||||
queue.async_started();
|
||||
queue.signal_async_handler(
|
||||
interrupt_signal,
|
||||
new Queue::queue_info_session_t{.queue = queue_info,
|
||||
new Queue::queue_info_session_t{.queue = queue,
|
||||
.inst_pkt = std::move(inst_pkt),
|
||||
.inst_pkt_id = inst_pkt_id,
|
||||
.kernel_pkt = kernel_pkt,
|
||||
.interrupt_signal = interrupt_signal});
|
||||
.interrupt_signal = interrupt_signal,
|
||||
.tid = thr_id,
|
||||
.kernel_id = kernel_id,
|
||||
.correlation_id = corr_id,
|
||||
.kernel_pkt = kernel_pkt});
|
||||
}
|
||||
|
||||
writer(transformed_packets.data(), transformed_packets.size());
|
||||
@@ -329,7 +336,7 @@ Queue::Queue(const AgentCache& agent,
|
||||
}
|
||||
|
||||
void
|
||||
Queue::register_callback(ClientID id, QueueCB enqueue_cb, CompletedCB complete_cb)
|
||||
Queue::register_callback(ClientID id, queue_cb_t enqueue_cb, completed_cb_t complete_cb)
|
||||
{
|
||||
_callbacks.wlock([&](auto& map) {
|
||||
LOG_IF(FATAL, rocprofiler::common::get_val(map, id)) << "ID already exists!";
|
||||
|
||||
@@ -20,9 +20,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler/buffer_tracing.h>
|
||||
#include <rocprofiler/fwd.h>
|
||||
|
||||
#include "lib/common/synchronized.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler/hsa/agent_cache.hpp"
|
||||
#include "lib/rocprofiler/hsa/aql_packet.hpp"
|
||||
|
||||
@@ -30,6 +32,7 @@
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <hsa/hsa_ven_amd_aqlprofile.h>
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
|
||||
#include <atomic>
|
||||
@@ -42,10 +45,49 @@
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace context
|
||||
{
|
||||
struct correlation_id;
|
||||
}
|
||||
namespace hsa
|
||||
{
|
||||
using ClientID = int64_t;
|
||||
|
||||
union rocprofiler_packet
|
||||
{
|
||||
hsa_ext_amd_aql_pm4_packet_t ext_amd_aql_pm4;
|
||||
hsa_kernel_dispatch_packet_t kernel_dispatch;
|
||||
hsa_barrier_and_packet_t barrier_and;
|
||||
hsa_barrier_or_packet_t barrier_or;
|
||||
|
||||
rocprofiler_packet()
|
||||
: ext_amd_aql_pm4{null_amd_aql_pm4_packet}
|
||||
{}
|
||||
|
||||
rocprofiler_packet(hsa_ext_amd_aql_pm4_packet_t val)
|
||||
: ext_amd_aql_pm4{val}
|
||||
{}
|
||||
|
||||
rocprofiler_packet(hsa_kernel_dispatch_packet_t val)
|
||||
: kernel_dispatch{val}
|
||||
{}
|
||||
|
||||
rocprofiler_packet(hsa_barrier_and_packet_t val)
|
||||
: barrier_and{val}
|
||||
{}
|
||||
|
||||
rocprofiler_packet(hsa_barrier_or_packet_t val)
|
||||
: barrier_or{val}
|
||||
{}
|
||||
|
||||
~rocprofiler_packet() = default;
|
||||
rocprofiler_packet(const rocprofiler_packet&) = default;
|
||||
rocprofiler_packet(rocprofiler_packet&&) noexcept = default;
|
||||
|
||||
rocprofiler_packet& operator=(const rocprofiler_packet&) = default;
|
||||
rocprofiler_packet& operator=(rocprofiler_packet&&) noexcept = default;
|
||||
};
|
||||
|
||||
// Interceptor for a single specific queue
|
||||
class Queue
|
||||
{
|
||||
@@ -54,24 +96,25 @@ public:
|
||||
// Function prototype used to notify consumers that a kernel has been
|
||||
// enqueued. An AQL packet can be returned that will be injected into
|
||||
// the queue.
|
||||
using QueueCB = std::function<
|
||||
std::unique_ptr<AQLPacket>(const Queue&, ClientID, const hsa_ext_amd_aql_pm4_packet_t&)>;
|
||||
using queue_cb_t = std::function<
|
||||
std::unique_ptr<AQLPacket>(const Queue&, ClientID, const rocprofiler_packet&)>;
|
||||
// Signals the completion of the kernel packet.
|
||||
using CompletedCB = std::function<void(const Queue&,
|
||||
ClientID,
|
||||
const hsa_ext_amd_aql_pm4_packet_t&,
|
||||
std::unique_ptr<AQLPacket>)>;
|
||||
using callback_map_t = std::unordered_map<ClientID, std::pair<QueueCB, CompletedCB>>;
|
||||
using completed_cb_t = std::function<
|
||||
void(const Queue&, ClientID, const rocprofiler_packet&, std::unique_ptr<AQLPacket>)>;
|
||||
using callback_map_t = std::unordered_map<ClientID, std::pair<queue_cb_t, completed_cb_t>>;
|
||||
|
||||
// Internal session information that is used by write interceptor
|
||||
// to track state of the intercepted kernel.
|
||||
struct queue_info_session_t
|
||||
{
|
||||
Queue& queue;
|
||||
std::unique_ptr<AQLPacket> inst_pkt = {};
|
||||
ClientID inst_pkt_id = 0;
|
||||
hsa_ext_amd_aql_pm4_packet_t kernel_pkt = null_amd_aql_pm4_packet;
|
||||
hsa_signal_t interrupt_signal = {};
|
||||
Queue& queue;
|
||||
std::unique_ptr<AQLPacket> inst_pkt = {};
|
||||
ClientID inst_pkt_id = 0;
|
||||
hsa_signal_t interrupt_signal = {};
|
||||
rocprofiler_thread_id_t tid = common::get_tid();
|
||||
rocprofiler_kernel_id_t kernel_id = 0;
|
||||
context::correlation_id* correlation_id = nullptr;
|
||||
rocprofiler_packet kernel_pkt = {};
|
||||
};
|
||||
|
||||
Queue(const AgentCache& agent,
|
||||
@@ -93,16 +136,10 @@ public:
|
||||
void create_signal(uint32_t attribute, hsa_signal_t* signal) const;
|
||||
void signal_async_handler(const hsa_signal_t& signal, Queue::queue_info_session_t* data) const;
|
||||
|
||||
rocprofiler_queue_id_t get_id() const
|
||||
{
|
||||
return {.handle = reinterpret_cast<uint64_t>(intercept_queue())};
|
||||
};
|
||||
template <typename FuncT>
|
||||
void signal_callback(FuncT&& func) const;
|
||||
|
||||
template <class Func>
|
||||
void signal_callback(Func&& func) const
|
||||
{
|
||||
_callbacks.rlock([&func](const auto& data) { func(data); });
|
||||
}
|
||||
rocprofiler_queue_id_t get_id() const;
|
||||
|
||||
// Fast check to see if we have any callbacks we need to notify
|
||||
int get_notifiers() const { return _notifiers; }
|
||||
@@ -113,7 +150,7 @@ public:
|
||||
void async_started() { _active_async_packets++; }
|
||||
void async_complete() { _active_async_packets--; }
|
||||
|
||||
void register_callback(ClientID id, QueueCB enqueue_cb, CompletedCB complete_cb);
|
||||
void register_callback(ClientID id, queue_cb_t enqueue_cb, completed_cb_t complete_cb);
|
||||
void remove_callback(ClientID id);
|
||||
|
||||
const CoreApiTable& core_api() const { return _core_api; }
|
||||
@@ -129,5 +166,18 @@ private:
|
||||
hsa_queue_t* _intercept_queue = nullptr;
|
||||
};
|
||||
|
||||
inline rocprofiler_queue_id_t
|
||||
Queue::get_id() const
|
||||
{
|
||||
return {.handle = reinterpret_cast<uint64_t>(intercept_queue())};
|
||||
};
|
||||
|
||||
template <typename FuncT>
|
||||
inline void
|
||||
Queue::signal_callback(FuncT&& func) const
|
||||
{
|
||||
_callbacks.rlock([&func](const auto& data) { func(data); });
|
||||
}
|
||||
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -80,7 +80,7 @@ QueueController::add_queue(hsa_queue_t* id, std::unique_ptr<Queue> queue)
|
||||
CHECK(queue);
|
||||
_callback_cache.wlock([&](auto& callbacks) {
|
||||
_queues.wlock([&](auto& map) {
|
||||
const auto agent_id = queue->get_agent().get_rocp_agent().id.handle;
|
||||
const auto agent_id = queue->get_agent().get_rocp_agent()->id.handle;
|
||||
map[id] = std::move(queue);
|
||||
for(const auto& [cbid, cb_tuple] : callbacks)
|
||||
{
|
||||
@@ -102,8 +102,8 @@ QueueController::destory_queue(hsa_queue_t* id)
|
||||
|
||||
ClientID
|
||||
QueueController::add_callback(const rocprofiler_agent_t& agent,
|
||||
Queue::QueueCB qcb,
|
||||
Queue::CompletedCB ccb)
|
||||
Queue::queue_cb_t qcb,
|
||||
Queue::completed_cb_t ccb)
|
||||
{
|
||||
static std::atomic<ClientID> client_id = 1;
|
||||
ClientID return_id;
|
||||
@@ -114,7 +114,7 @@ QueueController::add_callback(const rocprofiler_agent_t& agent,
|
||||
_queues.wlock([&](auto& map) {
|
||||
for(auto& [_, queue] : map)
|
||||
{
|
||||
if(queue->get_agent().get_rocp_agent().id.handle == agent.id.handle)
|
||||
if(queue->get_agent().get_rocp_agent()->id.handle == agent.id.handle)
|
||||
{
|
||||
queue->register_callback(return_id, qcb, ccb);
|
||||
}
|
||||
@@ -150,7 +150,7 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
|
||||
for(const auto* itr : agents)
|
||||
{
|
||||
auto cached_agent = agent::get_agent_cache(itr);
|
||||
if(cached_agent && cached_agent->get_rocp_agent().type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
if(cached_agent && cached_agent->get_rocp_agent()->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
{
|
||||
get_supported_agents().emplace(cached_agent->index(), *cached_agent);
|
||||
}
|
||||
@@ -188,6 +188,20 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
|
||||
}
|
||||
}
|
||||
|
||||
const Queue*
|
||||
QueueController::get_queue(const hsa_queue_t& _hsa_queue) const
|
||||
{
|
||||
return _queues.rlock(
|
||||
[](const queue_map_t& _data, const hsa_queue_t& _inp) -> const Queue* {
|
||||
for(const auto& itr : _data)
|
||||
{
|
||||
if(itr.first->id == _inp.id) return itr.second.get();
|
||||
}
|
||||
return nullptr;
|
||||
},
|
||||
_hsa_queue);
|
||||
}
|
||||
|
||||
QueueController&
|
||||
get_queue_controller()
|
||||
{
|
||||
|
||||
@@ -47,7 +47,7 @@ public:
|
||||
|
||||
// Add callback to queues associated with the agent. Returns a client
|
||||
// id that can be used by callers to remove the callback.
|
||||
ClientID add_callback(const rocprofiler_agent_t&, Queue::QueueCB, Queue::CompletedCB);
|
||||
ClientID add_callback(const rocprofiler_agent_t&, Queue::queue_cb_t, Queue::completed_cb_t);
|
||||
void remove_callback(ClientID);
|
||||
|
||||
const CoreApiTable& get_core_table() const { return _core_table; }
|
||||
@@ -57,9 +57,11 @@ public:
|
||||
const auto& get_supported_agents() const { return _supported_agents; }
|
||||
auto& get_supported_agents() { return _supported_agents; }
|
||||
|
||||
const Queue* get_queue(const hsa_queue_t&) const;
|
||||
|
||||
private:
|
||||
using agent_callback_tuple_t =
|
||||
std::tuple<rocprofiler_agent_t, Queue::QueueCB, Queue::CompletedCB>;
|
||||
std::tuple<rocprofiler_agent_t, Queue::queue_cb_t, Queue::completed_cb_t>;
|
||||
using queue_map_t = std::unordered_map<hsa_queue_t*, std::unique_ptr<Queue>>;
|
||||
using client_id_map_t = std::unordered_map<ClientID, agent_callback_tuple_t>;
|
||||
using agent_cache_map_t = std::unordered_map<uint32_t, AgentCache>;
|
||||
|
||||
@@ -41,18 +41,14 @@ namespace internal_threading
|
||||
{
|
||||
using thread_pool_t = PTL::ThreadPool;
|
||||
using task_group_t = PTL::TaskGroup<void>;
|
||||
using thread_pool_cleanup_t = rocprofiler::common::static_cleanup_wrapper<
|
||||
std::unique_ptr<thread_pool_t>,
|
||||
std::function<void(std::unique_ptr<thread_pool_t>&)>>;
|
||||
using thread_pool_cleanup_t = common::static_cleanup_wrapper<std::unique_ptr<thread_pool_t>>;
|
||||
using task_group_cleanup_t =
|
||||
std::pair<std::unique_ptr<task_group_t>, std::shared_ptr<thread_pool_cleanup_t>>;
|
||||
using thread_pool_vec_t = std::vector<std::shared_ptr<thread_pool_cleanup_t>>;
|
||||
|
||||
// Note: task_group maintains a shared_ptr copy to thread_pool to ensure it is not destroyed
|
||||
// before the task can be sync'd.
|
||||
using task_group_vec_t = rocprofiler::common::static_cleanup_wrapper<
|
||||
std::vector<task_group_cleanup_t>,
|
||||
std::function<void(std::vector<task_group_cleanup_t>&)>>;
|
||||
using task_group_vec_t = common::static_cleanup_wrapper<std::vector<task_group_cleanup_t>>;
|
||||
|
||||
void notify_pre_internal_thread_create(rocprofiler_runtime_library_t);
|
||||
void notify_post_internal_thread_create(rocprofiler_runtime_library_t);
|
||||
|
||||
@@ -351,6 +351,7 @@ invoke_client_finalizers()
|
||||
// set to nullptr so finalize only gets called once
|
||||
itr.configure_result->finalize = nullptr;
|
||||
}
|
||||
context::deactivate_client_contexts(itr.internal_client_id);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -377,6 +378,7 @@ invoke_client_finalizer(rocprofiler_client_id_t client_id)
|
||||
// set to nullptr so finalize only gets called once
|
||||
itr.configure_result->finalize = nullptr;
|
||||
}
|
||||
context::deactivate_client_contexts(itr.internal_client_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -472,8 +474,6 @@ finalize()
|
||||
if(get_init_status() > 0)
|
||||
{
|
||||
invoke_client_finalizers();
|
||||
for(auto& itr : rocprofiler::context::get_active_contexts())
|
||||
itr.store(nullptr, std::memory_order_seq_cst);
|
||||
}
|
||||
internal_threading::finalize();
|
||||
set_fini_status(1);
|
||||
|
||||
@@ -52,9 +52,9 @@ ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND, "Agent ID no
|
||||
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, "HW counter not found")
|
||||
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID,
|
||||
"Context configuration is not valid")
|
||||
ROCPROFILER_STATUS_STRING(
|
||||
ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED,
|
||||
"Context failed to be started (may be already started or atomic swap may have failed)")
|
||||
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED, "Context failed to start")
|
||||
ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT,
|
||||
"Context has a conflict with another context")
|
||||
ROCPROFILER_STATUS_STRING(
|
||||
ROCPROFILER_STATUS_ERROR_BUFFER_BUSY,
|
||||
"Buffer operation failed because it is currently busy handling another request")
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include <dlfcn.h>
|
||||
#include <pthread.h>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
@@ -483,8 +484,11 @@ TEST(rocprofiler_lib, callback_registration_lambda_with_result)
|
||||
EXPECT_GT(elapsed, 0);
|
||||
#else
|
||||
decltype(elapsed) elapsed_tolerance = 0.25 * elapsed;
|
||||
int64_t diff = (cb_data.client_elapsed - elapsed);
|
||||
auto frac = std::abs(diff) / (1.0 * elapsed);
|
||||
EXPECT_NEAR(elapsed, cb_data.client_elapsed, elapsed_tolerance)
|
||||
<< "it is possible this failed due to noise on the machine";
|
||||
<< "% diff = " << std::fixed << std::setprecision(3) << (100.0 * frac)
|
||||
<< "%. It is possible this failed due to noise on the machine";
|
||||
#endif
|
||||
|
||||
ASSERT_NE(cb_data.client_id, nullptr);
|
||||
|
||||
Ссылка в новой задаче
Block a user