Run clang-format on all source files

Change-Id: Ifb52ca306286b6b2d473821bed9db28e9f616d50
Этот коммит содержится в:
Laurent Morichetti
2022-04-18 15:09:58 -07:00
коммит произвёл Laurent Morichetti
родитель 89f6880371
Коммит 15ab5d9cda
40 изменённых файлов: 1332 добавлений и 1228 удалений
+43 -47
Просмотреть файл
@@ -25,35 +25,30 @@
// Traced API domains
typedef enum {
ACTIVITY_DOMAIN_HSA_API = 0, // HSA API domain
ACTIVITY_DOMAIN_HSA_OPS = 1, // HSA async activity domain
ACTIVITY_DOMAIN_HIP_OPS = 2, // HIP async activity domain
ACTIVITY_DOMAIN_HCC_OPS = ACTIVITY_DOMAIN_HIP_OPS, // HCC async activity domain
ACTIVITY_DOMAIN_HIP_VDI = ACTIVITY_DOMAIN_HIP_OPS, // HIP VDI async activity domain
ACTIVITY_DOMAIN_HIP_API = 3, // HIP API domain
ACTIVITY_DOMAIN_KFD_API = 4, // KFD API domain
ACTIVITY_DOMAIN_EXT_API = 5, // External ID domain
ACTIVITY_DOMAIN_ROCTX = 6, // ROCTX domain
ACTIVITY_DOMAIN_HSA_EVT = 7, // HSA events
ACTIVITY_DOMAIN_HSA_API = 0, // HSA API domain
ACTIVITY_DOMAIN_HSA_OPS = 1, // HSA async activity domain
ACTIVITY_DOMAIN_HIP_OPS = 2, // HIP async activity domain
ACTIVITY_DOMAIN_HCC_OPS = ACTIVITY_DOMAIN_HIP_OPS, // HCC async activity domain
ACTIVITY_DOMAIN_HIP_VDI = ACTIVITY_DOMAIN_HIP_OPS, // HIP VDI async activity domain
ACTIVITY_DOMAIN_HIP_API = 3, // HIP API domain
ACTIVITY_DOMAIN_KFD_API = 4, // KFD API domain
ACTIVITY_DOMAIN_EXT_API = 5, // External ID domain
ACTIVITY_DOMAIN_ROCTX = 6, // ROCTX domain
ACTIVITY_DOMAIN_HSA_EVT = 7, // HSA events
ACTIVITY_DOMAIN_NUMBER
} activity_domain_t;
// Extension API opcodes
typedef enum {
ACTIVITY_EXT_OP_MARK = 0,
ACTIVITY_EXT_OP_EXTERN_ID = 1
} activity_ext_op_t;
typedef enum { ACTIVITY_EXT_OP_MARK = 0, ACTIVITY_EXT_OP_EXTERN_ID = 1 } activity_ext_op_t;
// API calback type
typedef void (*activity_rtapi_callback_t)(uint32_t domain, uint32_t cid, const void* data, void* arg);
typedef void (*activity_rtapi_callback_t)(uint32_t domain, uint32_t cid, const void* data,
void* arg);
typedef uint32_t activity_kind_t;
typedef uint32_t activity_op_t;
// API callback phase
typedef enum {
ACTIVITY_API_PHASE_ENTER = 0,
ACTIVITY_API_PHASE_EXIT = 1
} activity_api_phase_t;
typedef enum { ACTIVITY_API_PHASE_ENTER = 0, ACTIVITY_API_PHASE_EXIT = 1 } activity_api_phase_t;
// Trace record types
// Correlation id
@@ -61,39 +56,40 @@ typedef uint64_t activity_correlation_id_t;
// Activity record type
typedef struct activity_record_s {
uint32_t domain; // activity domain id
activity_kind_t kind; // activity kind
activity_op_t op; // activity op
union {
struct {
activity_correlation_id_t correlation_id; // activity ID
uint64_t begin_ns; // host begin timestamp
uint64_t end_ns; // host end timestamp
};
struct {
uint32_t se; // sampled SE
uint64_t cycle; // sample cycle
uint64_t pc; // sample PC
} pc_sample;
uint32_t domain; // activity domain id
activity_kind_t kind; // activity kind
activity_op_t op; // activity op
union {
struct {
activity_correlation_id_t correlation_id; // activity ID
uint64_t begin_ns; // host begin timestamp
uint64_t end_ns; // host end timestamp
};
union {
struct {
int device_id; // device id
uint64_t queue_id; // queue id
};
struct {
uint32_t process_id; // device id
uint32_t thread_id; // thread id
};
struct {
activity_correlation_id_t external_id; // external correlatino id
};
struct {
uint32_t se; // sampled SE
uint64_t cycle; // sample cycle
uint64_t pc; // sample PC
} pc_sample;
};
union {
struct {
int device_id; // device id
uint64_t queue_id; // queue id
};
size_t bytes; // data size bytes
struct {
uint32_t process_id; // device id
uint32_t thread_id; // thread id
};
struct {
activity_correlation_id_t external_id; // external correlatino id
};
};
size_t bytes; // data size bytes
} activity_record_t;
// Activity sync calback type
typedef void* (*activity_sync_callback_t)(uint32_t cid, activity_record_t* record, const void* data, void* arg);
typedef void* (*activity_sync_callback_t)(uint32_t cid, activity_record_t* record, const void* data,
void* arg);
// Activity async calback type
typedef void (*activity_id_callback_t)(activity_correlation_id_t id);
typedef void (*activity_async_callback_t)(uint32_t op, void* record, void* arg);
+66 -81
Просмотреть файл
@@ -81,17 +81,15 @@ typedef activity_domain_t roctracer_domain_t;
// Return Op string by given domain and Op code
// NULL returned on the error and the library errno is set
const char* roctracer_op_string(
uint32_t domain, // tracing domain
uint32_t op, // activity op ID
uint32_t kind); // activity kind
const char* roctracer_op_string(uint32_t domain, // tracing domain
uint32_t op, // activity op ID
uint32_t kind); // activity kind
// Return Op code and kind by given string
roctracer_status_t roctracer_op_code(
uint32_t domain, // tracing domain
const char* str, // [in] op string
uint32_t* op, // [out] op code
uint32_t* kind); // [out] op kind code if not NULL
roctracer_status_t roctracer_op_code(uint32_t domain, // tracing domain
const char* str, // [in] op string
uint32_t* op, // [out] op code
uint32_t* kind); // [out] op kind code if not NULL
////////////////////////////////////////////////////////////////////////////////
// Callback API
@@ -106,24 +104,22 @@ typedef activity_rtapi_callback_t roctracer_rtapi_callback_t;
// Enable runtime API callbacks
roctracer_status_t roctracer_enable_op_callback(
activity_domain_t domain, // tracing domain
uint32_t op, // API call ID
activity_rtapi_callback_t callback, // callback function pointer
void* arg); // [in/out] callback arg
activity_domain_t domain, // tracing domain
uint32_t op, // API call ID
activity_rtapi_callback_t callback, // callback function pointer
void* arg); // [in/out] callback arg
roctracer_status_t roctracer_enable_domain_callback(
activity_domain_t domain, // tracing domain
activity_rtapi_callback_t callback, // callback function pointer
void* arg); // [in/out] callback arg
activity_domain_t domain, // tracing domain
activity_rtapi_callback_t callback, // callback function pointer
void* arg); // [in/out] callback arg
roctracer_status_t roctracer_enable_callback(
activity_rtapi_callback_t callback, // callback function pointer
void* arg); // [in/out] callback arg
activity_rtapi_callback_t callback, // callback function pointer
void* arg); // [in/out] callback arg
// Disable runtime API callbacks
roctracer_status_t roctracer_disable_op_callback(
activity_domain_t domain, // tracing domain
uint32_t op); // API call ID
roctracer_status_t roctracer_disable_domain_callback(
activity_domain_t domain); // tracing domain
roctracer_status_t roctracer_disable_op_callback(activity_domain_t domain, // tracing domain
uint32_t op); // API call ID
roctracer_status_t roctracer_disable_domain_callback(activity_domain_t domain); // tracing domain
roctracer_status_t roctracer_disable_callback();
////////////////////////////////////////////////////////////////////////////////
@@ -140,33 +136,32 @@ typedef activity_record_t roctracer_record_t;
// Return next record
static inline roctracer_status_t roctracer_next_record(
const activity_record_t* record, // [in] record ptr
const activity_record_t** next) // [out] next record ptr
const activity_record_t* record, // [in] record ptr
const activity_record_t** next) // [out] next record ptr
{
*next = record + 1;
return ROCTRACER_STATUS_SUCCESS;
}
// Tracer allocator type
typedef void (*roctracer_allocator_t)(
char** ptr, // memory pointer
size_t size, // memory size
void* arg); // allocator arg
typedef void (*roctracer_allocator_t)(char** ptr, // memory pointer
size_t size, // memory size
void* arg); // allocator arg
// Pool callback type
typedef void (*roctracer_buffer_callback_t)(
const char* begin, // [in] available buffered trace records
const char* end, // [in] end of buffered trace records
void* arg); // [in/out] callback arg
const char* begin, // [in] available buffered trace records
const char* end, // [in] end of buffered trace records
void* arg); // [in/out] callback arg
// Tracer properties
typedef struct {
uint32_t mode; // roctracer mode
size_t buffer_size; // buffer size
roctracer_allocator_t alloc_fun; // memory alocator function pointer
void* alloc_arg; // memory alocator function pointer
roctracer_buffer_callback_t buffer_callback_fun; // tracer record callback function
void* buffer_callback_arg; // tracer record callback arg
uint32_t mode; // roctracer mode
size_t buffer_size; // buffer size
roctracer_allocator_t alloc_fun; // memory alocator function pointer
void* alloc_arg; // memory alocator function pointer
roctracer_buffer_callback_t buffer_callback_fun; // tracer record callback function
void* buffer_callback_arg; // tracer record callback arg
} roctracer_properties_t;
// Tracer memory pool type
@@ -175,78 +170,69 @@ typedef void roctracer_pool_t;
// Create tracer memory pool
// The first invocation sets the default pool
roctracer_status_t roctracer_open_pool_expl(
const roctracer_properties_t* properties, // tracer pool properties
roctracer_pool_t** pool); // [out] returns tracer pool if not NULL,
// otherwise sets the default one if it is not set yet
const roctracer_properties_t* properties, // tracer pool properties
roctracer_pool_t** pool); // [out] returns tracer pool if not NULL,
// otherwise sets the default one if it is not set yet
static inline roctracer_status_t roctracer_open_pool(
const roctracer_properties_t* properties) // tracer pool properties
const roctracer_properties_t* properties) // tracer pool properties
{
return roctracer_open_pool_expl(properties, NULL);
return roctracer_open_pool_expl(properties, NULL);
}
// otherwise the error is generated
// otherwise the error is generated
// Close tracer memory pool
roctracer_status_t roctracer_close_pool_expl(
roctracer_pool_t* pool); // [in] memory pool, NULL is a default one
static inline roctracer_status_t roctracer_close_pool()
{
return roctracer_close_pool_expl(NULL);
}
roctracer_pool_t* pool); // [in] memory pool, NULL is a default one
static inline roctracer_status_t roctracer_close_pool() { return roctracer_close_pool_expl(NULL); }
// Return current default pool
// Set new default pool if the argument is not NULL
roctracer_pool_t* roctracer_default_pool_expl(
roctracer_pool_t* pool); // [in] new default pool if not NULL
static inline roctracer_pool_t* roctracer_default_pool()
{
return roctracer_default_pool_expl(NULL);
roctracer_pool_t* pool); // [in] new default pool if not NULL
static inline roctracer_pool_t* roctracer_default_pool() {
return roctracer_default_pool_expl(NULL);
}
// Enable activity records logging
roctracer_status_t roctracer_enable_op_activity_expl(
activity_domain_t domain, // tracing domain
uint32_t op, // activity op ID
roctracer_pool_t* pool); // memory pool, NULL is a default one
activity_domain_t domain, // tracing domain
uint32_t op, // activity op ID
roctracer_pool_t* pool); // memory pool, NULL is a default one
static inline roctracer_status_t roctracer_enable_op_activity(
activity_domain_t domain, // tracing domain
uint32_t op) // activity op ID
activity_domain_t domain, // tracing domain
uint32_t op) // activity op ID
{
return roctracer_enable_op_activity_expl(domain, op, NULL);
return roctracer_enable_op_activity_expl(domain, op, NULL);
}
roctracer_status_t roctracer_enable_domain_activity_expl(
activity_domain_t domain, // tracing domain
roctracer_pool_t* pool); // memory pool, NULL is a default one
activity_domain_t domain, // tracing domain
roctracer_pool_t* pool); // memory pool, NULL is a default one
static inline roctracer_status_t roctracer_enable_domain_activity(
activity_domain_t domain) // tracing domain
activity_domain_t domain) // tracing domain
{
return roctracer_enable_domain_activity_expl(domain, NULL);
return roctracer_enable_domain_activity_expl(domain, NULL);
}
roctracer_status_t roctracer_enable_activity_expl(
roctracer_pool_t* pool); // memory pool, NULL is a default one
static inline roctracer_status_t roctracer_enable_activity()
{
return roctracer_enable_activity_expl(NULL);
roctracer_pool_t* pool); // memory pool, NULL is a default one
static inline roctracer_status_t roctracer_enable_activity() {
return roctracer_enable_activity_expl(NULL);
}
// Disable activity records logging
roctracer_status_t roctracer_disable_op_activity(
activity_domain_t domain, // tracing domain
uint32_t op); // activity op ID
roctracer_status_t roctracer_disable_domain_activity(
activity_domain_t domain); // tracing domain
roctracer_status_t roctracer_disable_op_activity(activity_domain_t domain, // tracing domain
uint32_t op); // activity op ID
roctracer_status_t roctracer_disable_domain_activity(activity_domain_t domain); // tracing domain
roctracer_status_t roctracer_disable_activity();
// Flush available activity records
roctracer_status_t roctracer_flush_activity_expl(
roctracer_pool_t* pool); // memory pool, NULL is a default one
static inline roctracer_status_t roctracer_flush_activity()
{
return roctracer_flush_activity_expl(NULL);
roctracer_pool_t* pool); // memory pool, NULL is a default one
static inline roctracer_status_t roctracer_flush_activity() {
return roctracer_flush_activity_expl(NULL);
}
// Get system timestamp
roctracer_status_t roctracer_get_timestamp(
uint64_t* timestamp); // [out] return timestamp
roctracer_status_t roctracer_get_timestamp(uint64_t* timestamp); // [out] return timestamp
// Load/Unload methods
bool roctracer_load();
@@ -254,9 +240,8 @@ void roctracer_unload();
void roctracer_flush_buf();
// Set properties
roctracer_status_t roctracer_set_properties(
roctracer_domain_t domain, // tracing domain
void* propertes); // tracing properties
roctracer_status_t roctracer_set_properties(roctracer_domain_t domain, // tracing domain
void* propertes); // tracing properties
#ifdef __cplusplus
} // extern "C" block
+3 -4
Просмотреть файл
@@ -28,9 +28,8 @@
namespace roctracer {
// Generic callbacks table
template <int N>
class CbTable {
public:
template <int N> class CbTable {
public:
typedef std::mutex mutex_t;
CbTable() {
@@ -63,7 +62,7 @@ class CbTable {
return ret;
}
private:
private:
activity_rtapi_callback_t callback_[N];
void* arg_[N];
mutex_t mutex_;
+2 -1
Просмотреть файл
@@ -62,7 +62,8 @@ roctracer_status_t roctracer_activity_push_external_correlation_id(activity_corr
// Notifies that the calling thread is leaving an external API region.
// Pop an external correlation id for the calling thread.
// 'lastId' returns the last external correlation if not NULL
roctracer_status_t roctracer_activity_pop_external_correlation_id(activity_correlation_id_t* last_id);
roctracer_status_t roctracer_activity_pop_external_correlation_id(
activity_correlation_id_t* last_id);
#ifdef __cplusplus
} // extern "C" block
+4 -9
Просмотреть файл
@@ -21,19 +21,14 @@
#ifndef INC_ROCTRACER_HCC_H_
#define INC_ROCTRACER_HCC_H_
enum {
HIP_OP_ID_DISPATCH = 0,
HIP_OP_ID_COPY = 1,
HIP_OP_ID_BARRIER = 2,
HIP_OP_ID_NUMBER = 3
};
enum { HIP_OP_ID_DISPATCH = 0, HIP_OP_ID_COPY = 1, HIP_OP_ID_BARRIER = 2, HIP_OP_ID_NUMBER = 3 };
#ifdef __cplusplus
extern "C" {
#endif
typedef void (hipInitAsyncActivityCallback_t)(void* id_callback, void* op_callback, void* arg);
typedef bool (hipEnableAsyncActivityCallback_t)(unsigned op, bool enable);
typedef const char* (hipGetOpName_t)(unsigned op);
typedef void(hipInitAsyncActivityCallback_t)(void* id_callback, void* op_callback, void* arg);
typedef bool(hipEnableAsyncActivityCallback_t)(unsigned op, bool enable);
typedef const char*(hipGetOpName_t)(unsigned op);
#ifdef __cplusplus
}
#endif
+2 -2
Просмотреть файл
@@ -25,12 +25,12 @@
#include <iostream>
inline static std::ostream& operator<<(std::ostream& out, const unsigned char& v) {
out << (unsigned int)v;
out << (unsigned int)v;
return out;
}
inline static std::ostream& operator<<(std::ostream& out, const char& v) {
out << (unsigned char)v;
out << (unsigned char)v;
return out;
}
#endif // __cplusplus
+6 -8
Просмотреть файл
@@ -42,9 +42,7 @@ enum hsa_op_id_t {
namespace roctracer {
namespace hsa_support {
enum {
HSA_OP_ID_async_copy = 0
};
enum { HSA_OP_ID_async_copy = 0 };
extern CoreApiTable CoreApiTable_saved;
extern AmdExtTable AmdExtTable_saved;
@@ -57,18 +55,18 @@ struct ops_properties_t {
const char* output_prefix;
};
}; // namespace hsa_support
}; // namespace hsa_support
typedef hsa_support::ops_properties_t hsa_ops_properties_t;
}; // namespace roctracer
}; // namespace roctracer
#include "hsa_ostream_ops.h"
#else // !__cplusplus
#else // !__cplusplus
typedef void* hsa_amd_queue_intercept_handler;
typedef void* hsa_amd_runtime_queue_notifier;
#endif //! __cplusplus
#endif //! __cplusplus
#include <hsa_prof_str.h>
#endif // INC_ROCTRACER_HSA_H_
#endif // INC_ROCTRACER_HSA_H_
+2 -2
Просмотреть файл
@@ -47,8 +47,8 @@ enum roctx_api_id_t {
typedef struct roctx_api_data_s {
union {
struct {
const char* message;
roctx_range_id_t id;
const char* message;
roctx_range_id_t id;
};
struct {
const char* message;
+2 -2
Просмотреть файл
@@ -59,8 +59,8 @@ void roctxMarkA(const char* message);
////////////////////////////////////////////////////////////////////////////////
// Ranges annotating API
// Returns the 0 based level of a nested range being started by given message associated to this range.
// A negative value is returned on the error.
// Returns the 0 based level of a nested range being started by given message associated to this
// range. A negative value is returned on the error.
int roctxRangePushA(const char* message);
#define roctxRangePush(message) roctxRangePushA(message)
+1 -1
Просмотреть файл
@@ -27,7 +27,7 @@ namespace roctracer {
enum { API_CB_MASK = 0x1, ACT_CB_MASK = 0x2 };
class hip_act_cb_tracker_t {
private:
private:
std::map<uint32_t, uint32_t> data;
public:
+73 -63
Просмотреть файл
@@ -25,29 +25,29 @@
#include <mutex>
#include <dlfcn.h>
#define ONLD_TRACE(str) \
if (getenv("ROCP_ONLOAD_TRACE")) do { \
std::cout << "PID(" << GetPid() << "): TRACER_LOADER::" << __FUNCTION__ << " " << str << std::endl << std::flush; \
} while(0);
#define ONLD_TRACE(str) \
if (getenv("ROCP_ONLOAD_TRACE")) do { \
std::cout << "PID(" << GetPid() << "): TRACER_LOADER::" << __FUNCTION__ << " " << str \
<< std::endl \
<< std::flush; \
} while (0);
namespace roctracer {
// Base runtime loader class
template <class T>
class BaseLoader : public T {
template <class T> class BaseLoader : public T {
static uint32_t GetPid() { return syscall(__NR_getpid); }
public:
public:
typedef std::mutex mutex_t;
typedef BaseLoader<T> loader_t;
bool Enabled() const { return (handle_ != NULL); }
template <class fun_t>
fun_t* GetFun(const char* fun_name) {
template <class fun_t> fun_t* GetFun(const char* fun_name) {
if (handle_ == NULL) return NULL;
fun_t *f = (fun_t*) dlsym(handle_, fun_name);
fun_t* f = (fun_t*)dlsym(handle_, fun_name);
if ((to_check_symb_ == true) && (f == NULL)) {
fprintf(stderr, "roctracer: symbol lookup '%s' failed: \"%s\"\n", fun_name, dlerror());
abort();
@@ -68,11 +68,11 @@ class BaseLoader : public T {
}
static loader_t* GetRef() { return instance_; }
static void SetLibName(const char *name) { lib_name_ = name; }
static void SetLibName(const char* name) { lib_name_ = name; }
private:
private:
BaseLoader() {
const int flags = (to_load_ == true) ? RTLD_LAZY : RTLD_LAZY|RTLD_NOLOAD;
const int flags = (to_load_ == true) ? RTLD_LAZY : RTLD_LAZY | RTLD_NOLOAD;
handle_ = dlopen(lib_name_, flags);
ONLD_TRACE("(" << lib_name_ << " = " << handle_ << ")");
if ((to_check_open_ == true) && (handle_ == NULL)) {
@@ -99,14 +99,14 @@ class BaseLoader : public T {
// 'rocprofiler' library loader class
class RocpApi {
public:
public:
typedef BaseLoader<RocpApi> Loader;
typedef bool (RegisterCallback_t)(uint32_t op, void* callback, void* arg);
typedef bool (OperateCallback_t)(uint32_t op);
typedef bool (InitCallback_t)(void* callback, void* arg);
typedef bool (EnableCallback_t)(uint32_t op, bool enable);
typedef const char* (NameCallback_t)(uint32_t op);
typedef bool(RegisterCallback_t)(uint32_t op, void* callback, void* arg);
typedef bool(OperateCallback_t)(uint32_t op);
typedef bool(InitCallback_t)(void* callback, void* arg);
typedef bool(EnableCallback_t)(uint32_t op, bool enable);
typedef const char*(NameCallback_t)(uint32_t op);
RegisterCallback_t* RegisterApiCallback;
OperateCallback_t* RemoveApiCallback;
@@ -118,7 +118,7 @@ class RocpApi {
OperateCallback_t* RemoveEvtCallback;
NameCallback_t* GetEvtName;
protected:
protected:
void init(Loader* loader) {
RegisterApiCallback = loader->GetFun<RegisterCallback_t>("RegisterApiCallback");
RemoveApiCallback = loader->GetFun<OperateCallback_t>("RemoveApiCallback");
@@ -134,29 +134,36 @@ class RocpApi {
// HIP runtime library loader class
#if STATIC_BUILD
__attribute__((weak)) hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg) { return hipErrorUnknown; }
__attribute__((weak)) hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg) {
return hipErrorUnknown;
}
__attribute__((weak)) hipError_t hipRemoveApiCallback(uint32_t id) { return hipErrorUnknown; }
__attribute__((weak)) hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg) { return hipErrorUnknown; }
__attribute__((weak)) hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg) {
return hipErrorUnknown;
}
__attribute__((weak)) hipError_t hipRemoveActivityCallback(uint32_t id) { return hipErrorUnknown; }
__attribute__((weak)) const char* hipKernelNameRef(const hipFunction_t f) { return NULL; }
__attribute__((weak)) const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream) { return NULL; }
__attribute__((weak)) const char* hipKernelNameRefByPtr(const void* hostFunction,
hipStream_t stream) {
return NULL;
}
__attribute__((weak)) int hipGetStreamDeviceId(hipStream_t stream) { return 0; }
__attribute__((weak)) const char* hipApiName(uint32_t id) { return NULL; }
class HipLoaderStatic {
public:
public:
typedef std::mutex mutex_t;
typedef HipLoaderStatic loader_t;
typedef std::atomic<loader_t*> instance_t;
typedef hipError_t (RegisterApiCallback_t)(uint32_t id, void* fun, void* arg);
typedef hipError_t (RemoveApiCallback_t)(uint32_t id);
typedef hipError_t (RegisterActivityCallback_t)(uint32_t id, void* fun, void* arg);
typedef hipError_t (RemoveActivityCallback_t)(uint32_t id);
typedef const char* (KernelNameRef_t)(const hipFunction_t f);
typedef const char* (KernelNameRefByPtr_t)(const void* hostFunction, hipStream_t stream);
typedef int (GetStreamDeviceId_t)(hipStream_t stream);
typedef const char* (ApiName_t)(uint32_t id);
typedef hipError_t(RegisterApiCallback_t)(uint32_t id, void* fun, void* arg);
typedef hipError_t(RemoveApiCallback_t)(uint32_t id);
typedef hipError_t(RegisterActivityCallback_t)(uint32_t id, void* fun, void* arg);
typedef hipError_t(RemoveActivityCallback_t)(uint32_t id);
typedef const char*(KernelNameRef_t)(const hipFunction_t f);
typedef const char*(KernelNameRefByPtr_t)(const void* hostFunction, hipStream_t stream);
typedef int(GetStreamDeviceId_t)(hipStream_t stream);
typedef const char*(ApiName_t)(uint32_t id);
RegisterApiCallback_t* RegisterApiCallback;
RemoveApiCallback_t* RemoveApiCallback;
@@ -181,7 +188,7 @@ class HipLoaderStatic {
bool Enabled() const { return true; }
private:
private:
HipLoaderStatic() {
RegisterApiCallback = hipRegisterApiCallback;
RemoveApiCallback = hipRemoveApiCallback;
@@ -189,7 +196,7 @@ class HipLoaderStatic {
RemoveActivityCallback = hipRemoveActivityCallback;
KernelNameRef = hipKernelNameRef;
KernelNameRefByPtr = hipKernelNameRefByPtr;
GetStreamDeviceId = hipGetStreamDeviceId;
GetStreamDeviceId = hipGetStreamDeviceId;
ApiName = hipApiName;
}
@@ -198,7 +205,7 @@ class HipLoaderStatic {
};
#else
class HipApi {
public:
public:
typedef BaseLoader<HipApi> Loader;
typedef decltype(hipRegisterApiCallback) RegisterApiCallback_t;
@@ -219,15 +226,16 @@ class HipApi {
GetStreamDeviceId_t* GetStreamDeviceId;
ApiName_t* ApiName;
protected:
protected:
void init(Loader* loader) {
RegisterApiCallback = loader->GetFun<RegisterApiCallback_t>("hipRegisterApiCallback");
RemoveApiCallback = loader->GetFun<RemoveApiCallback_t>("hipRemoveApiCallback");
RegisterActivityCallback = loader->GetFun<RegisterActivityCallback_t>("hipRegisterActivityCallback");
RegisterActivityCallback =
loader->GetFun<RegisterActivityCallback_t>("hipRegisterActivityCallback");
RemoveActivityCallback = loader->GetFun<RemoveActivityCallback_t>("hipRemoveActivityCallback");
KernelNameRef = loader->GetFun<KernelNameRef_t>("hipKernelNameRef");
KernelNameRefByPtr = loader->GetFun<KernelNameRefByPtr_t>("hipKernelNameRefByPtr");
GetStreamDeviceId = loader->GetFun<GetStreamDeviceId_t>("hipGetStreamDeviceId");
GetStreamDeviceId = loader->GetFun<GetStreamDeviceId_t>("hipGetStreamDeviceId");
ApiName = loader->GetFun<ApiName_t>("hipApiName");
}
};
@@ -236,17 +244,19 @@ class HipApi {
// HCC runtime library loader class
#include "inc/roctracer_hcc.h"
class HccApi {
public:
public:
typedef BaseLoader<HccApi> Loader;
hipInitAsyncActivityCallback_t* InitActivityCallback;
hipEnableAsyncActivityCallback_t* EnableActivityCallback;
hipGetOpName_t* GetOpName;
protected:
protected:
void init(Loader* loader) {
InitActivityCallback = loader->GetFun<hipInitAsyncActivityCallback_t>("hipInitActivityCallback");
EnableActivityCallback = loader->GetFun<hipEnableAsyncActivityCallback_t>("hipEnableActivityCallback");
InitActivityCallback =
loader->GetFun<hipInitAsyncActivityCallback_t>("hipInitActivityCallback");
EnableActivityCallback =
loader->GetFun<hipEnableAsyncActivityCallback_t>("hipEnableActivityCallback");
GetOpName = loader->GetFun<hipGetOpName_t>("hipGetCmdName");
}
};
@@ -254,7 +264,7 @@ class HccApi {
// rocTX runtime library loader class
#include "inc/roctracer_roctx.h"
class RocTxApi {
public:
public:
typedef BaseLoader<RocTxApi> Loader;
typedef decltype(RegisterApiCallback) RegisterApiCallback_t;
@@ -265,7 +275,7 @@ class RocTxApi {
RemoveApiCallback_t* RemoveApiCallback;
RangeStackIterate_t* RangeStackIterate;
protected:
protected:
void init(Loader* loader) {
RegisterApiCallback = loader->GetFun<RegisterApiCallback_t>("RegisterApiCallback");
RemoveApiCallback = loader->GetFun<RemoveApiCallback_t>("RemoveApiCallback");
@@ -284,31 +294,31 @@ typedef BaseLoader<HipApi> HipLoaderShared;
typedef HipLoaderShared HipLoader;
#endif
} // namespace roctracer
} // namespace roctracer
#define LOADER_INSTANTIATE_2() \
template<class T> typename roctracer::BaseLoader<T>::mutex_t roctracer::BaseLoader<T>::mutex_; \
template<class T> std::atomic<roctracer::BaseLoader<T>*> roctracer::BaseLoader<T>::instance_{}; \
template<class T> bool roctracer::BaseLoader<T>::to_load_ = false; \
template<class T> bool roctracer::BaseLoader<T>::to_check_open_ = true; \
template<class T> bool roctracer::BaseLoader<T>::to_check_symb_ = true; \
template<> const char* roctracer::RocpLoader::lib_name_ = "librocprofiler64.so"; \
template<> bool roctracer::RocpLoader::to_load_ = true; \
template<> const char* roctracer::HccLoader::lib_name_ = "libamdhip64.so"; \
template<> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \
template<> bool roctracer::RocTxLoader::to_load_ = true;
#define LOADER_INSTANTIATE_2() \
template <class T> typename roctracer::BaseLoader<T>::mutex_t roctracer::BaseLoader<T>::mutex_; \
template <class T> std::atomic<roctracer::BaseLoader<T>*> roctracer::BaseLoader<T>::instance_{}; \
template <class T> bool roctracer::BaseLoader<T>::to_load_ = false; \
template <class T> bool roctracer::BaseLoader<T>::to_check_open_ = true; \
template <class T> bool roctracer::BaseLoader<T>::to_check_symb_ = true; \
template <> const char* roctracer::RocpLoader::lib_name_ = "librocprofiler64.so"; \
template <> bool roctracer::RocpLoader::to_load_ = true; \
template <> const char* roctracer::HccLoader::lib_name_ = "libamdhip64.so"; \
template <> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \
template <> bool roctracer::RocTxLoader::to_load_ = true;
#if STATIC_BUILD
#define LOADER_INSTANTIATE_HIP() \
roctracer::HipLoaderStatic::mutex_t roctracer::HipLoaderStatic::mutex_; \
#define LOADER_INSTANTIATE_HIP() \
roctracer::HipLoaderStatic::mutex_t roctracer::HipLoaderStatic::mutex_; \
roctracer::HipLoaderStatic::instance_t roctracer::HipLoaderStatic::instance_{};
#else
#define LOADER_INSTANTIATE_HIP() \
template<> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so";
#define LOADER_INSTANTIATE_HIP() \
template <> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so";
#endif
#define LOADER_INSTANTIATE() \
LOADER_INSTANTIATE_2(); \
#define LOADER_INSTANTIATE() \
LOADER_INSTANTIATE_2(); \
LOADER_INSTANTIATE_HIP();
#endif // SRC_CORE_LOADER_H_
#endif // SRC_CORE_LOADER_H_
+15 -15
Просмотреть файл
@@ -42,7 +42,7 @@
namespace roctracer {
class MemoryPool {
public:
public:
typedef std::mutex mutex_t;
static void allocator_default(char** ptr, size_t size, void* arg) {
@@ -52,12 +52,12 @@ class MemoryPool {
} else if (size != 0) {
*ptr = reinterpret_cast<char*>(realloc(*ptr, size));
} else {
free(*ptr);
free(*ptr);
*ptr = NULL;
}
}
MemoryPool(const roctracer_properties_t& properties) {
MemoryPool(const roctracer_properties_t& properties) {
// Assigning pool allocator
alloc_fun_ = allocator_default;
alloc_arg_ = NULL;
@@ -89,14 +89,14 @@ class MemoryPool {
~MemoryPool() {
Flush();
PTHREAD_CALL(pthread_cancel(consumer_thread_));
void *res;
void* res;
PTHREAD_CALL(pthread_join(consumer_thread_, &res));
if (res != PTHREAD_CANCELED) EXC_ABORT(ROCTRACER_STATUS_ERROR, "consumer thread wasn't stopped correctly");
if (res != PTHREAD_CANCELED)
EXC_ABORT(ROCTRACER_STATUS_ERROR, "consumer thread wasn't stopped correctly");
allocator_default(&pool_begin_, 0, alloc_arg_);
}
template <typename Record>
void Write(const Record& record) {
template <typename Record> void Write(const Record& record) {
std::lock_guard<mutex_t> lock(write_mutex_);
getRecord<Record>(record);
}
@@ -112,7 +112,7 @@ class MemoryPool {
}
}
private:
private:
struct consumer_arg_t {
MemoryPool* obj;
const char* begin;
@@ -126,11 +126,13 @@ class MemoryPool {
}
};
template <typename Record>
Record* getRecord(const Record& init) {
template <typename Record> Record* getRecord(const Record& init) {
char* next = write_ptr_ + sizeof(Record);
if (next > buffer_end_) {
if (write_ptr_ == buffer_begin_) EXC_ABORT(ROCTRACER_STATUS_ERROR, "buffer size(" << buffer_size_ << ") is less then the record(" << sizeof(Record) << ")");
if (write_ptr_ == buffer_begin_)
EXC_ABORT(ROCTRACER_STATUS_ERROR,
"buffer size(" << buffer_size_ << ") is less then the record(" << sizeof(Record)
<< ")");
spawn_reader(buffer_begin_, write_ptr_);
buffer_begin_ = (buffer_end_ == pool_end_) ? pool_begin_ : buffer_end_;
buffer_end_ = buffer_begin_ + buffer_size_;
@@ -145,12 +147,10 @@ class MemoryPool {
return ptr;
}
static void reset_reader(consumer_arg_t* arg) {
arg->valid.store(false);
}
static void reset_reader(consumer_arg_t* arg) { arg->valid.store(false); }
static void sync_reader(const consumer_arg_t* arg) {
while(arg->valid.load() == true) PTHREAD_CALL(sched_yield());
while (arg->valid.load() == true) PTHREAD_CALL(sched_yield());
}
static void* reader_fun(void* consumer_arg) {
+267 -258
Просмотреть файл
@@ -66,7 +66,6 @@
#define API_METHOD_PREFIX \
roctracer_status_t err = ROCTRACER_STATUS_SUCCESS; \
try {
#define API_METHOD_SUFFIX \
} \
catch (std::exception & e) { \
@@ -83,10 +82,12 @@
(void)err; \
return X;
#define ONLOAD_TRACE(str) \
if (getenv("ROCP_ONLOAD_TRACE")) do { \
std::cout << "PID(" << GetPid() << "): TRACER_LIB::" << __FUNCTION__ << " " << str << std::endl << std::flush; \
} while(0);
#define ONLOAD_TRACE(str) \
if (getenv("ROCP_ONLOAD_TRACE")) do { \
std::cout << "PID(" << GetPid() << "): TRACER_LIB::" << __FUNCTION__ << " " << str \
<< std::endl \
<< std::flush; \
} while (0);
#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin")
#define ONLOAD_TRACE_END() ONLOAD_TRACE("end")
@@ -95,7 +96,8 @@ static inline uint32_t GetPid() { return syscall(__NR_getpid); }
///////////////////////////////////////////////////////////////////////////////////////////////////
// Mark callback
//
typedef void (mark_api_callback_t)(uint32_t domain, uint32_t cid, const void* callback_data, void* arg);
typedef void(mark_api_callback_t)(uint32_t domain, uint32_t cid, const void* callback_data,
void* arg);
mark_api_callback_t* mark_api_callback_ptr = NULL;
///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -195,7 +197,8 @@ ActJournal* act_journal;
template <typename Functor> struct journal_functor_t {
Functor func_;
journal_functor_t(Functor&& f) : func_(std::forward<Functor>(f)) {}
template <typename Data> bool operator ()(activity_domain_t domain, uint32_t op, Data&& /* data */) const {
template <typename Data>
bool operator()(activity_domain_t domain, uint32_t op, Data&& /* data */) const {
func_(domain, op);
return true;
}
@@ -208,13 +211,13 @@ using act_dis_functor_t = journal_functor_t<roctracer_disable_op_activity_t>;
template <>
template <typename Data>
bool cb_en_functor_t::operator ()(activity_domain_t domain, uint32_t op, Data&& data) const {
bool cb_en_functor_t::operator()(activity_domain_t domain, uint32_t op, Data&& data) const {
func_(domain, op, data.callback, data.user_data);
return true;
}
template <>
template <typename Data>
bool act_en_functor_t::operator ()(activity_domain_t domain, uint32_t op, Data&& data) const {
bool act_en_functor_t::operator()(activity_domain_t domain, uint32_t op, Data&& data) const {
func_(domain, op, data.pool);
return true;
}
@@ -222,9 +225,7 @@ bool act_en_functor_t::operator ()(activity_domain_t domain, uint32_t op, Data&&
void hsa_async_copy_handler(::proxy::Tracker::entry_t* entry);
void hsa_kernel_handler(::proxy::Tracker::entry_t* entry);
constexpr TraceBuffer<trace_entry_t>::flush_prm_t trace_buffer_prm[] = {
{COPY_ENTRY_TYPE, hsa_async_copy_handler},
{KERNEL_ENTRY_TYPE, hsa_kernel_handler}
};
{COPY_ENTRY_TYPE, hsa_async_copy_handler}, {KERNEL_ENTRY_TYPE, hsa_kernel_handler}};
TraceBuffer<trace_entry_t>* trace_buffer = NULL;
namespace hsa_support {
@@ -250,18 +251,19 @@ roctracer_stop_cb_t roctracer_stop_cb = NULL;
roctracer_status_t GetExcStatus(const std::exception& e) {
const util::exception* roctracer_exc_ptr = dynamic_cast<const util::exception*>(&e);
return (roctracer_exc_ptr) ? static_cast<roctracer_status_t>(roctracer_exc_ptr->status()) : ROCTRACER_STATUS_ERROR;
return (roctracer_exc_ptr) ? static_cast<roctracer_status_t>(roctracer_exc_ptr->status())
: ROCTRACER_STATUS_ERROR;
}
class GlobalCounter {
public:
public:
typedef std::mutex mutex_t;
typedef uint64_t counter_t;
typedef std::atomic<counter_t> atomic_counter_t;
static counter_t Increment() { return counter_.fetch_add(1, std::memory_order_relaxed); }
private:
private:
static mutex_t mutex_;
static atomic_counter_t counter_;
};
@@ -273,12 +275,12 @@ struct roctracer_api_data_t {
union {
hip_api_data_t hip;
};
roctracer_api_data_t() {};
roctracer_api_data_t(){};
};
struct record_pair_t {
roctracer_record_t record;
roctracer_api_data_t data;
record_pair_t() {};
record_pair_t(){};
};
typedef std::stack<record_pair_t> record_pair_stack_t;
static thread_local record_pair_stack_t* record_pair_stack = NULL;
@@ -295,15 +297,18 @@ static thread_local std::stack<activity_correlation_id_t> external_id_stack;
static inline void CorrelationIdRegistr(const activity_correlation_id_t& correlation_id) {
std::lock_guard<correlation_id_mutex_t> lck(correlation_id_mutex);
const auto ret = correlation_id_map.insert({correlation_id, correlation_id_tls});
if (ret.second == false) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id is not unique(" << correlation_id << ")");
if (ret.second == false)
EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id is not unique(" << correlation_id << ")");
DEBUG_TRACE("CorrelationIdRegistr id(%lu) id_tls(%lu)\n", correlation_id, correlation_id_tls);
}
static inline activity_correlation_id_t CorrelationIdLookup(const activity_correlation_id_t& correlation_id) {
static inline activity_correlation_id_t CorrelationIdLookup(
const activity_correlation_id_t& correlation_id) {
std::lock_guard<correlation_id_mutex_t> lck(correlation_id_mutex);
auto it = correlation_id_map.find(correlation_id);
if (it == correlation_id_map.end()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id lookup failed(" << correlation_id << ")");
if (it == correlation_id_map.end())
EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id lookup failed(" << correlation_id << ")");
const activity_correlation_id_t ret_val = it->second;
correlation_id_map.erase(it);
@@ -320,14 +325,14 @@ hip_act_cb_tracker_t* hip_act_cb_tracker = NULL;
inline uint32_t HipApiActivityEnableCheck(uint32_t op) {
if (hip_act_cb_tracker == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "hip_act_cb_tracker is NULL");
const uint32_t mask = hip_act_cb_tracker->enable_check(op, API_CB_MASK);
const uint32_t ret = (mask & ACT_CB_MASK);
const uint32_t ret = (mask & ACT_CB_MASK);
return ret;
}
inline uint32_t HipApiActivityDisableCheck(uint32_t op) {
if (hip_act_cb_tracker == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "hip_act_cb_tracker is NULL");
const uint32_t mask = hip_act_cb_tracker->disable_check(op, API_CB_MASK);
const uint32_t ret = (mask & ACT_CB_MASK);
const uint32_t ret = (mask & ACT_CB_MASK);
return ret;
}
@@ -340,16 +345,12 @@ inline uint32_t HipActActivityEnableCheck(uint32_t op) {
inline uint32_t HipActActivityDisableCheck(uint32_t op) {
if (hip_act_cb_tracker == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "hip_act_cb_tracker is NULL");
const uint32_t mask = hip_act_cb_tracker->disable_check(op, ACT_CB_MASK);
const uint32_t ret = (mask & API_CB_MASK);
const uint32_t ret = (mask & API_CB_MASK);
return ret;
}
void* HIP_SyncApiDataCallback(
uint32_t op_id,
roctracer_record_t* record,
const void* callback_data,
void* arg)
{
void* HIP_SyncApiDataCallback(uint32_t op_id, roctracer_record_t* record, const void* callback_data,
void* arg) {
static hsa_rt_utils::Timer timer;
if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t;
@@ -397,19 +398,18 @@ void* HIP_SyncApiDataCallback(
correlation_id_tls = 0;
}
const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0);
DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) time_ns(%lu)\n",
name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? data_ptr->correlation_id : 0, timer.timestamp_ns());
const char* name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0);
DEBUG_TRACE(
"HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) "
"correlation_id(%lu) time_ns(%lu)\n",
name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()),
(data_ptr) ? data_ptr->correlation_id : 0, timer.timestamp_ns());
return ret;
}
void* HIP_SyncActivityCallback(
uint32_t op_id,
roctracer_record_t* record,
const void* callback_data,
void* arg)
{
void* HIP_SyncActivityCallback(uint32_t op_id, roctracer_record_t* record,
const void* callback_data, void* arg) {
static hsa_rt_utils::Timer timer;
const timestamp_t timestamp_ns = timer.timestamp_ns();
if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t;
@@ -462,8 +462,9 @@ void* HIP_SyncActivityCallback(
// Getting record of stacked
if (record == NULL) {
if (record_pair_stack->empty()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty");
auto& top = record_pair_stack->top();
if (record_pair_stack->empty())
EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty");
auto& top = record_pair_stack->top();
record = &(top.record);
}
@@ -491,9 +492,12 @@ void* HIP_SyncActivityCallback(
correlation_id_tls = 0;
}
const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0);
DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n",
name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? data_ptr->correlation_id : 0, timestamp_ns);
const char* name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0);
DEBUG_TRACE(
"HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) "
"correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n",
name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()),
(data_ptr) ? data_ptr->correlation_id : 0, timestamp_ns);
return ret;
}
@@ -510,9 +514,12 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) {
if (record_ptr->correlation_id == 0) return;
pool->Write(*record_ptr);
const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind);
DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d) beg_ns(%lu) end_ns(%lu)\n",
name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id, record_ptr->begin_ns, record_ptr->end_ns);
const char* name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind);
DEBUG_TRACE(
"HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d) "
"beg_ns(%lu) end_ns(%lu)\n",
name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id,
record_ptr->begin_ns, record_ptr->end_ns);
}
// Open output file
@@ -528,7 +535,8 @@ FILE* open_output_file(const char* prefix, const char* name) {
perror(errmsg.str().c_str());
abort();
}
} else file_handle = stdout;
} else
file_handle = stdout;
return file_handle;
}
@@ -542,16 +550,11 @@ void hsa_kernel_handler(::proxy::Tracker::entry_t* entry) {
if (index == 0) {
kernel_file_handle = open_output_file(hsa_support::output_prefix, "results.txt");
}
fprintf(kernel_file_handle, "dispatch[%lu], gpu-id(%u), tid(%u), kernel-name(\"%s\"), time(%lu,%lu,%lu,%lu)\n",
index,
//::util::HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index,
entry->dev_index,
entry->kernel.tid,
entry->kernel.name,
entry->dispatch,
entry->begin,
entry->end,
entry->complete);
fprintf(kernel_file_handle,
"dispatch[%lu], gpu-id(%u), tid(%u), kernel-name(\"%s\"), time(%lu,%lu,%lu,%lu)\n", index,
//::util::HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index,
entry->dev_index, entry->kernel.tid, entry->kernel.name, entry->dispatch, entry->begin,
entry->end, entry->complete);
#if 0
fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")",
index,
@@ -581,32 +584,29 @@ void hsa_kernel_handler(::proxy::Tracker::entry_t* entry) {
void hsa_async_copy_handler(::proxy::Tracker::entry_t* entry) {
activity_record_t record{};
record.domain = ACTIVITY_DOMAIN_HSA_OPS; // activity domain id
record.begin_ns = entry->begin; // host begin timestamp
record.end_ns = entry->end; // host end timestamp
record.device_id = 0; // device id
record.domain = ACTIVITY_DOMAIN_HSA_OPS; // activity domain id
record.begin_ns = entry->begin; // host begin timestamp
record.end_ns = entry->end; // host end timestamp
record.device_id = 0; // device id
hsa_support::async_copy_callback_fun(hsa_support::HSA_OP_ID_async_copy, &record, hsa_support::async_copy_callback_arg);
hsa_support::async_copy_callback_fun(hsa_support::HSA_OP_ID_async_copy, &record,
hsa_support::async_copy_callback_arg);
}
hsa_status_t hsa_amd_memory_async_copy_interceptor(
void* dst, hsa_agent_t dst_agent, const void* src,
hsa_agent_t src_agent, size_t size, uint32_t num_dep_signals,
const hsa_signal_t* dep_signals, hsa_signal_t completion_signal)
{
hsa_status_t hsa_amd_memory_async_copy_interceptor(void* dst, hsa_agent_t dst_agent,
const void* src, hsa_agent_t src_agent,
size_t size, uint32_t num_dep_signals,
const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal) {
hsa_status_t status = HSA_STATUS_SUCCESS;
if (hsa_support::async_copy_callback_enabled) {
trace_entry_t* entry = trace_buffer->GetEntry();
::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry);
status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src,
src_agent, size, num_dep_signals,
status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src, src_agent, size, num_dep_signals,
dep_signals, entry->signal);
if (status != HSA_STATUS_SUCCESS) ::proxy::Tracker::Disable(entry);
}
else
{
status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src,
src_agent, size, num_dep_signals,
} else {
status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src, src_agent, size, num_dep_signals,
dep_signals, completion_signal);
}
return status;
@@ -616,24 +616,18 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor(
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
hsa_signal_t completion_signal)
{
hsa_signal_t completion_signal) {
hsa_status_t status = HSA_STATUS_SUCCESS;
if (hsa_support::async_copy_callback_enabled) {
trace_entry_t* entry = trace_buffer->GetEntry();
::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry);
status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src,
src_offset, range, copy_agent,
dir, num_dep_signals, dep_signals,
entry->signal);
status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, src_offset, range, copy_agent,
dir, num_dep_signals, dep_signals, entry->signal);
if (status != HSA_STATUS_SUCCESS) ::proxy::Tracker::Disable(entry);
}
else
{
status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src,
src_offset, range, copy_agent,
dir, num_dep_signals, dep_signals,
completion_signal);
} else {
status =
hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, src_offset, range, copy_agent, dir,
num_dep_signals, dep_signals, completion_signal);
}
return status;
}
@@ -685,11 +679,7 @@ PUBLIC_API const char* roctracer_error_string() {
// Return Op string by given domain and activity/API codes
// NULL returned on the error and the library errno is set
PUBLIC_API const char* roctracer_op_string(
uint32_t domain,
uint32_t op,
uint32_t kind)
{
PUBLIC_API const char* roctracer_op_string(uint32_t domain, uint32_t op, uint32_t kind) {
API_METHOD_PREFIX
switch (domain) {
case ACTIVITY_DOMAIN_HSA_API:
@@ -711,18 +701,15 @@ PUBLIC_API const char* roctracer_op_string(
}
// Return Op code and kind by given string
PUBLIC_API roctracer_status_t roctracer_op_code(
uint32_t domain,
const char* str,
uint32_t* op,
uint32_t* kind)
{
PUBLIC_API roctracer_status_t roctracer_op_code(uint32_t domain, const char* str, uint32_t* op,
uint32_t* kind) {
API_METHOD_PREFIX
switch (domain) {
case ACTIVITY_DOMAIN_HSA_API: {
*op = roctracer::hsa_support::GetApiCode(str);
if (*op == HSA_API_ID_NUMBER) {
EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")");
EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER,
"Invalid API name \"" << str << "\", domain ID(" << domain << ")");
}
if (kind != NULL) *kind = 0;
break;
@@ -730,7 +717,8 @@ PUBLIC_API roctracer_status_t roctracer_op_code(
case ACTIVITY_DOMAIN_HIP_API: {
*op = hipApiIdByName(str);
if (*op == HIP_API_ID_NONE) {
EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")");
EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER,
"Invalid API name \"" << str << "\", domain ID(" << domain << ")");
}
if (kind != NULL) *kind = 0;
break;
@@ -743,13 +731,20 @@ PUBLIC_API roctracer_status_t roctracer_op_code(
static inline uint32_t get_op_begin(uint32_t domain) {
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: return 0;
case ACTIVITY_DOMAIN_HSA_API: return 0;
case ACTIVITY_DOMAIN_HSA_EVT: return 0;
case ACTIVITY_DOMAIN_HCC_OPS: return 0;
case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_FIRST;
case ACTIVITY_DOMAIN_EXT_API: return 0;
case ACTIVITY_DOMAIN_ROCTX: return 0;
case ACTIVITY_DOMAIN_HSA_OPS:
return 0;
case ACTIVITY_DOMAIN_HSA_API:
return 0;
case ACTIVITY_DOMAIN_HSA_EVT:
return 0;
case ACTIVITY_DOMAIN_HCC_OPS:
return 0;
case ACTIVITY_DOMAIN_HIP_API:
return HIP_API_ID_FIRST;
case ACTIVITY_DOMAIN_EXT_API:
return 0;
case ACTIVITY_DOMAIN_ROCTX:
return 0;
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
@@ -758,13 +753,21 @@ static inline uint32_t get_op_begin(uint32_t domain) {
static inline uint32_t get_op_end(uint32_t domain) {
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: return HSA_OP_ID_NUMBER;
case ACTIVITY_DOMAIN_HSA_API: return HSA_API_ID_NUMBER;
case ACTIVITY_DOMAIN_HSA_EVT: return HSA_EVT_ID_NUMBER;
case ACTIVITY_DOMAIN_HCC_OPS: return HIP_OP_ID_NUMBER;
case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_LAST + 1;;
case ACTIVITY_DOMAIN_EXT_API: return 0;
case ACTIVITY_DOMAIN_ROCTX: return ROCTX_API_ID_NUMBER;
case ACTIVITY_DOMAIN_HSA_OPS:
return HSA_OP_ID_NUMBER;
case ACTIVITY_DOMAIN_HSA_API:
return HSA_API_ID_NUMBER;
case ACTIVITY_DOMAIN_HSA_EVT:
return HSA_EVT_ID_NUMBER;
case ACTIVITY_DOMAIN_HCC_OPS:
return HIP_OP_ID_NUMBER;
case ACTIVITY_DOMAIN_HIP_API:
return HIP_API_ID_LAST + 1;
;
case ACTIVITY_DOMAIN_EXT_API:
return 0;
case ACTIVITY_DOMAIN_ROCTX:
return ROCTX_API_ID_NUMBER;
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
@@ -772,14 +775,12 @@ static inline uint32_t get_op_end(uint32_t domain) {
}
// Enable runtime API callbacks
static roctracer_status_t roctracer_enable_callback_fun(
roctracer_domain_t domain,
uint32_t op,
roctracer_rtapi_callback_t callback,
void* user_data)
{
static roctracer_status_t roctracer_enable_callback_fun(roctracer_domain_t domain, uint32_t op,
roctracer_rtapi_callback_t callback,
void* user_data) {
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: break;
case ACTIVITY_DOMAIN_HSA_OPS:
break;
case ACTIVITY_DOMAIN_HSA_API: {
#if 0
if (op == HSA_API_ID_DISPATCH) {
@@ -792,28 +793,42 @@ static roctracer_status_t roctracer_enable_callback_fun(
break;
}
case ACTIVITY_DOMAIN_HSA_EVT: {
const bool succ = roctracer::RocpLoader::Instance().RegisterEvtCallback(op, (void*)callback, user_data);
if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RegisterEvtCallback error(" << op << ") failed");
const bool succ =
roctracer::RocpLoader::Instance().RegisterEvtCallback(op, (void*)callback, user_data);
if (succ == false)
HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR,
"HSA::RegisterEvtCallback error(" << op << ") failed");
break;
}
case ACTIVITY_DOMAIN_HCC_OPS: break;
case ACTIVITY_DOMAIN_HCC_OPS:
break;
case ACTIVITY_DOMAIN_HIP_API: {
if (roctracer::HipLoader::Instance().Enabled() == false) break;
std::lock_guard<roctracer::hip_activity_mutex_t> lock(roctracer::hip_activity_mutex);
hipError_t hip_err = roctracer::HipLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "HIP::RegisterApiCallback(" << op << ") error(" << hip_err << ")");
hipError_t hip_err =
roctracer::HipLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data);
if (hip_err != hipSuccess)
HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR,
"HIP::RegisterApiCallback(" << op << ") error(" << hip_err << ")");
if (roctracer::HipApiActivityEnableCheck(op) == 0) {
hip_err = roctracer::HipLoader::Instance().RegisterActivityCallback(op, (void*)roctracer::HIP_SyncApiDataCallback, (void*)1);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "HIPAPI: HIP::RegisterActivityCallback(" << op << ") error(" << hip_err << ")");
hip_err = roctracer::HipLoader::Instance().RegisterActivityCallback(
op, (void*)roctracer::HIP_SyncApiDataCallback, (void*)1);
if (hip_err != hipSuccess)
HIP_EXC_RAISING(
ROCTRACER_STATUS_HIP_API_ERR,
"HIPAPI: HIP::RegisterActivityCallback(" << op << ") error(" << hip_err << ")");
}
break;
}
case ACTIVITY_DOMAIN_ROCTX: {
if (roctracer::RocTxLoader::Instance().Enabled()) {
const bool suc = roctracer::RocTxLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data);
if (suc == false) EXC_RAISING(ROCTRACER_STATUS_ROCTX_ERR, "ROCTX::RegisterApiCallback(" << op << ") failed");
const bool suc =
roctracer::RocTxLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data);
if (suc == false)
EXC_RAISING(ROCTRACER_STATUS_ROCTX_ERR,
"ROCTX::RegisterApiCallback(" << op << ") failed");
}
break;
}
@@ -823,32 +838,23 @@ static roctracer_status_t roctracer_enable_callback_fun(
return ROCTRACER_STATUS_SUCCESS;
}
static void roctracer_enable_callback_impl(
roctracer_domain_t domain,
uint32_t op,
roctracer_rtapi_callback_t callback,
void* user_data)
{
static void roctracer_enable_callback_impl(roctracer_domain_t domain, uint32_t op,
roctracer_rtapi_callback_t callback, void* user_data) {
roctracer::cb_journal->Insert(domain, op, {callback, user_data});
roctracer_enable_callback_fun(domain, op, callback, user_data);
}
PUBLIC_API roctracer_status_t roctracer_enable_op_callback(
roctracer_domain_t domain,
uint32_t op,
roctracer_rtapi_callback_t callback,
void* user_data)
{
PUBLIC_API roctracer_status_t roctracer_enable_op_callback(roctracer_domain_t domain, uint32_t op,
roctracer_rtapi_callback_t callback,
void* user_data) {
API_METHOD_PREFIX
roctracer_enable_callback_impl(domain, op, callback, user_data);
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_enable_domain_callback(
roctracer_domain_t domain,
roctracer_rtapi_callback_t callback,
void* user_data)
{
PUBLIC_API roctracer_status_t roctracer_enable_domain_callback(roctracer_domain_t domain,
roctracer_rtapi_callback_t callback,
void* user_data) {
API_METHOD_PREFIX
const uint32_t op_end = get_op_end(domain);
for (uint32_t op = get_op_begin(domain); op < op_end; ++op)
@@ -856,10 +862,8 @@ PUBLIC_API roctracer_status_t roctracer_enable_domain_callback(
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_enable_callback(
roctracer_rtapi_callback_t callback,
void* user_data)
{
PUBLIC_API roctracer_status_t roctracer_enable_callback(roctracer_rtapi_callback_t callback,
void* user_data) {
API_METHOD_PREFIX
for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) {
const uint32_t op_end = get_op_end(domain);
@@ -870,12 +874,10 @@ PUBLIC_API roctracer_status_t roctracer_enable_callback(
}
// Disable runtime API callbacks
static roctracer_status_t roctracer_disable_callback_fun(
roctracer_domain_t domain,
uint32_t op)
{
static roctracer_status_t roctracer_disable_callback_fun(roctracer_domain_t domain, uint32_t op) {
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: break;
case ACTIVITY_DOMAIN_HSA_OPS:
break;
case ACTIVITY_DOMAIN_HSA_API: {
#if 0
if (op == HSA_API_ID_DISPATCH) {
@@ -887,29 +889,38 @@ static roctracer_status_t roctracer_disable_callback_fun(
roctracer::hsa_support::cb_table.set(op, NULL, NULL);
break;
}
case ACTIVITY_DOMAIN_HCC_OPS: break;
case ACTIVITY_DOMAIN_HCC_OPS:
break;
case ACTIVITY_DOMAIN_HIP_API: {
if (roctracer::HipLoader::Instance().Enabled() == false) break;
std::lock_guard<roctracer::hip_activity_mutex_t> lock(roctracer::hip_activity_mutex);
const hipError_t hip_err = roctracer::HipLoader::Instance().RemoveApiCallback(op);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "HIP::RemoveApiCallback(" << op << "), error(" << hip_err << ")");
if (hip_err != hipSuccess)
HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR,
"HIP::RemoveApiCallback(" << op << "), error(" << hip_err << ")");
if (roctracer::HipApiActivityDisableCheck(op) == 0) {
const hipError_t hip_err = roctracer::HipLoader::Instance().RemoveActivityCallback(op);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "HIPAPI: HIP::RemoveActivityCallback op(" << op << "), error(" << hip_err << ")");
if (hip_err != hipSuccess)
HIP_EXC_RAISING(
ROCTRACER_STATUS_HIP_API_ERR,
"HIPAPI: HIP::RemoveActivityCallback op(" << op << "), error(" << hip_err << ")");
}
break;
}
case ACTIVITY_DOMAIN_HSA_EVT: {
const bool succ = roctracer::RocpLoader::Instance().RemoveEvtCallback(op);
if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RemoveEvtCallback error(" << op << ") failed");
if (succ == false)
HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR,
"HSA::RemoveEvtCallback error(" << op << ") failed");
break;
}
case ACTIVITY_DOMAIN_ROCTX: {
if (roctracer::RocTxLoader::Instance().Enabled()) {
const bool suc = roctracer::RocTxLoader::Instance().RemoveApiCallback(op);
if (suc == false) EXC_RAISING(ROCTRACER_STATUS_ROCTX_ERR, "ROCTX::RemoveApiCallback(" << op << ") failed");
if (suc == false)
EXC_RAISING(ROCTRACER_STATUS_ROCTX_ERR, "ROCTX::RemoveApiCallback(" << op << ") failed");
}
break;
}
@@ -919,26 +930,19 @@ static roctracer_status_t roctracer_disable_callback_fun(
return ROCTRACER_STATUS_SUCCESS;
}
static void roctracer_disable_callback_impl(
roctracer_domain_t domain,
uint32_t op)
{
roctracer::cb_journal->Remove(domain, op);
roctracer_disable_callback_fun(domain, op);
static void roctracer_disable_callback_impl(roctracer_domain_t domain, uint32_t op) {
roctracer::cb_journal->Remove(domain, op);
roctracer_disable_callback_fun(domain, op);
}
PUBLIC_API roctracer_status_t roctracer_disable_op_callback(
roctracer_domain_t domain,
uint32_t op)
{
PUBLIC_API roctracer_status_t roctracer_disable_op_callback(roctracer_domain_t domain,
uint32_t op) {
API_METHOD_PREFIX
roctracer_disable_callback_impl(domain, op);
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_disable_domain_callback(
roctracer_domain_t domain)
{
PUBLIC_API roctracer_status_t roctracer_disable_domain_callback(roctracer_domain_t domain) {
API_METHOD_PREFIX
const uint32_t op_end = get_op_end(domain);
for (uint32_t op = get_op_begin(domain); op < op_end; ++op)
@@ -946,8 +950,7 @@ PUBLIC_API roctracer_status_t roctracer_disable_domain_callback(
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_disable_callback()
{
PUBLIC_API roctracer_status_t roctracer_disable_callback() {
API_METHOD_PREFIX
for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) {
const uint32_t op_end = get_op_end(domain);
@@ -966,10 +969,8 @@ PUBLIC_API roctracer_pool_t* roctracer_default_pool_expl(roctracer_pool_t* pool)
}
// Open memory pool
PUBLIC_API roctracer_status_t roctracer_open_pool_expl(
const roctracer_properties_t* properties,
roctracer_pool_t** pool)
{
PUBLIC_API roctracer_status_t roctracer_open_pool_expl(const roctracer_properties_t* properties,
roctracer_pool_t** pool) {
API_METHOD_PREFIX
std::lock_guard<roctracer::memory_pool_mutex_t> lock(roctracer::memory_pool_mutex);
if ((pool == NULL) && (roctracer::memory_pool != NULL)) {
@@ -977,8 +978,10 @@ PUBLIC_API roctracer_status_t roctracer_open_pool_expl(
}
roctracer::MemoryPool* p = new roctracer::MemoryPool(*properties);
if (p == NULL) EXC_RAISING(ROCTRACER_STATUS_ERROR, "MemoryPool() error");
if (pool != NULL) *pool = p;
else roctracer::memory_pool = p;
if (pool != NULL)
*pool = p;
else
roctracer::memory_pool = p;
API_METHOD_SUFFIX
}
@@ -988,17 +991,14 @@ PUBLIC_API roctracer_status_t roctracer_close_pool_expl(roctracer_pool_t* pool)
std::lock_guard<roctracer::memory_pool_mutex_t> lock(roctracer::memory_pool_mutex);
roctracer_pool_t* ptr = (pool == NULL) ? roctracer_default_pool() : pool;
roctracer::MemoryPool* memory_pool = reinterpret_cast<roctracer::MemoryPool*>(ptr);
delete(memory_pool);
delete (memory_pool);
if (pool == NULL) roctracer::memory_pool = NULL;
API_METHOD_SUFFIX
}
// Enable activity records logging
static roctracer_status_t roctracer_enable_activity_fun(
roctracer_domain_t domain,
uint32_t op,
roctracer_pool_t* pool)
{
static roctracer_status_t roctracer_enable_activity_fun(roctracer_domain_t domain, uint32_t op,
roctracer_pool_t* pool) {
if (pool == NULL) pool = roctracer_default_pool();
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: {
@@ -1008,27 +1008,31 @@ static roctracer_status_t roctracer_enable_activity_fun(
const bool init_phase = (roctracer::RocpLoader::GetRef() == NULL);
if (roctracer::RocpLoader::GetRef() == NULL) break;
if (init_phase == true) {
roctracer::RocpLoader::Instance().InitActivityCallback((void*)roctracer::HSA_AsyncActivityCallback,
(void*)pool);
roctracer::RocpLoader::Instance().InitActivityCallback(
(void*)roctracer::HSA_AsyncActivityCallback, (void*)pool);
}
const bool succ = roctracer::RocpLoader::Instance().EnableActivityCallback(op, true);
if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::EnableActivityCallback error");
if (succ == false)
HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::EnableActivityCallback error");
}
break;
}
case ACTIVITY_DOMAIN_HSA_API: break;
case ACTIVITY_DOMAIN_HSA_EVT: break;
case ACTIVITY_DOMAIN_HSA_API:
break;
case ACTIVITY_DOMAIN_HSA_EVT:
break;
case ACTIVITY_DOMAIN_HCC_OPS: {
const bool init_phase = (roctracer::HccLoader::GetRef() == NULL);
if (roctracer::HccLoader::Instance().Enabled() == false) break;
if (init_phase == true) {
roctracer::HccLoader::Instance().InitActivityCallback((void*)roctracer::HCC_ActivityIdCallback,
(void*)roctracer::HCC_AsyncActivityCallback,
(void*)pool);
roctracer::HccLoader::Instance().InitActivityCallback(
(void*)roctracer::HCC_ActivityIdCallback, (void*)roctracer::HCC_AsyncActivityCallback,
(void*)pool);
}
const bool succ = roctracer::HccLoader::Instance().EnableActivityCallback(op, true);
if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HCC_OPS_ERR, "HCC::EnableActivityCallback error");
if (succ == false)
HCC_EXC_RAISING(ROCTRACER_STATUS_HCC_OPS_ERR, "HCC::EnableActivityCallback error");
break;
}
case ACTIVITY_DOMAIN_HIP_API: {
@@ -1036,41 +1040,38 @@ static roctracer_status_t roctracer_enable_activity_fun(
std::lock_guard<roctracer::hip_activity_mutex_t> lock(roctracer::hip_activity_mutex);
if (roctracer::HipActActivityEnableCheck(op) == 0) {
const hipError_t hip_err = roctracer::HipLoader::Instance().RegisterActivityCallback(op, (void*)roctracer::HIP_SyncActivityCallback, (void*)pool);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "HIP::RegisterActivityCallback(" << op << " error(" << hip_err << ")");
const hipError_t hip_err = roctracer::HipLoader::Instance().RegisterActivityCallback(
op, (void*)roctracer::HIP_SyncActivityCallback, (void*)pool);
if (hip_err != hipSuccess)
HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR,
"HIP::RegisterActivityCallback(" << op << " error(" << hip_err << ")");
}
break;
}
case ACTIVITY_DOMAIN_ROCTX: break;
case ACTIVITY_DOMAIN_ROCTX:
break;
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
return ROCTRACER_STATUS_SUCCESS;
}
static void roctracer_enable_activity_impl(
roctracer_domain_t domain,
uint32_t op,
roctracer_pool_t* pool)
{
roctracer::act_journal->Insert(domain, op, {pool});
roctracer_enable_activity_fun(domain, op, pool);
static void roctracer_enable_activity_impl(roctracer_domain_t domain, uint32_t op,
roctracer_pool_t* pool) {
roctracer::act_journal->Insert(domain, op, {pool});
roctracer_enable_activity_fun(domain, op, pool);
}
PUBLIC_API roctracer_status_t roctracer_enable_op_activity_expl(
roctracer_domain_t domain,
uint32_t op,
roctracer_pool_t* pool)
{
PUBLIC_API roctracer_status_t roctracer_enable_op_activity_expl(roctracer_domain_t domain,
uint32_t op,
roctracer_pool_t* pool) {
API_METHOD_PREFIX
roctracer_enable_activity_impl(domain, op, pool);
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_enable_domain_activity_expl(
roctracer_domain_t domain,
roctracer_pool_t* pool)
{
PUBLIC_API roctracer_status_t roctracer_enable_domain_activity_expl(roctracer_domain_t domain,
roctracer_pool_t* pool) {
API_METHOD_PREFIX
const uint32_t op_end = get_op_end(domain);
for (uint32_t op = get_op_begin(domain); op < op_end; ++op)
@@ -1078,9 +1079,7 @@ PUBLIC_API roctracer_status_t roctracer_enable_domain_activity_expl(
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_enable_activity_expl(
roctracer_pool_t* pool)
{
PUBLIC_API roctracer_status_t roctracer_enable_activity_expl(roctracer_pool_t* pool) {
API_METHOD_PREFIX
for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) {
const uint32_t op_end = get_op_end(domain);
@@ -1091,10 +1090,7 @@ PUBLIC_API roctracer_status_t roctracer_enable_activity_expl(
}
// Disable activity records logging
static roctracer_status_t roctracer_disable_activity_fun(
roctracer_domain_t domain,
uint32_t op)
{
static roctracer_status_t roctracer_disable_activity_fun(roctracer_domain_t domain, uint32_t op) {
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: {
if (op == HSA_OP_ID_COPY) {
@@ -1102,17 +1098,23 @@ static roctracer_status_t roctracer_disable_activity_fun(
} else {
if (roctracer::RocpLoader::GetRef() == NULL) break;
const bool succ = roctracer::RocpLoader::Instance().EnableActivityCallback(op, false);
if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::EnableActivityCallback(false) error, op(" << op << ")");
if (succ == false)
HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR,
"HSA::EnableActivityCallback(false) error, op(" << op << ")");
}
break;
}
case ACTIVITY_DOMAIN_HSA_API: break;
case ACTIVITY_DOMAIN_HSA_EVT: break;
case ACTIVITY_DOMAIN_HSA_API:
break;
case ACTIVITY_DOMAIN_HSA_EVT:
break;
case ACTIVITY_DOMAIN_HCC_OPS: {
if (roctracer::HccLoader::Instance().Enabled() == false) break;
const bool succ = roctracer::HccLoader::Instance().EnableActivityCallback(op, false);
if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HCC_OPS_ERR, "HCC::EnableActivityCallback(NULL) error, op(" << op << ")");
if (succ == false)
HCC_EXC_RAISING(ROCTRACER_STATUS_HCC_OPS_ERR,
"HCC::EnableActivityCallback(NULL) error, op(" << op << ")");
break;
}
case ACTIVITY_DOMAIN_HIP_API: {
@@ -1121,40 +1123,40 @@ static roctracer_status_t roctracer_disable_activity_fun(
if (roctracer::HipActActivityDisableCheck(op) == 0) {
const hipError_t hip_err = roctracer::HipLoader::Instance().RemoveActivityCallback(op);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "HIP::RemoveActivityCallback op(" << op << "), error(" << hip_err << ")");
if (hip_err != hipSuccess)
HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR,
"HIP::RemoveActivityCallback op(" << op << "), error(" << hip_err << ")");
} else {
const hipError_t hip_err = roctracer::HipLoader::Instance().RegisterActivityCallback(op, (void*)roctracer::HIP_SyncApiDataCallback, (void*)1);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "HIPACT: HIP::RegisterActivityCallback(" << op << ") error(" << hip_err << ")");
const hipError_t hip_err = roctracer::HipLoader::Instance().RegisterActivityCallback(
op, (void*)roctracer::HIP_SyncApiDataCallback, (void*)1);
if (hip_err != hipSuccess)
HIP_EXC_RAISING(
ROCTRACER_STATUS_HIP_API_ERR,
"HIPACT: HIP::RegisterActivityCallback(" << op << ") error(" << hip_err << ")");
}
break;
}
case ACTIVITY_DOMAIN_ROCTX: break;
case ACTIVITY_DOMAIN_ROCTX:
break;
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
return ROCTRACER_STATUS_SUCCESS;
}
static void roctracer_disable_activity_impl(
roctracer_domain_t domain,
uint32_t op)
{
static void roctracer_disable_activity_impl(roctracer_domain_t domain, uint32_t op) {
roctracer::act_journal->Remove(domain, op);
roctracer_disable_activity_fun(domain, op);
}
PUBLIC_API roctracer_status_t roctracer_disable_op_activity(
roctracer_domain_t domain,
uint32_t op)
{
PUBLIC_API roctracer_status_t roctracer_disable_op_activity(roctracer_domain_t domain,
uint32_t op) {
API_METHOD_PREFIX
roctracer_disable_activity_impl(domain, op);
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_disable_domain_activity(
roctracer_domain_t domain)
{
PUBLIC_API roctracer_status_t roctracer_disable_domain_activity(roctracer_domain_t domain) {
API_METHOD_PREFIX
const uint32_t op_end = get_op_end(domain);
for (uint32_t op = get_op_begin(domain); op < op_end; ++op)
@@ -1162,8 +1164,7 @@ PUBLIC_API roctracer_status_t roctracer_disable_domain_activity(
API_METHOD_SUFFIX
}
PUBLIC_API roctracer_status_t roctracer_disable_activity()
{
PUBLIC_API roctracer_status_t roctracer_disable_activity() {
API_METHOD_PREFIX
for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) {
const uint32_t op_end = get_op_end(domain);
@@ -1185,7 +1186,8 @@ PUBLIC_API roctracer_status_t roctracer_flush_activity_expl(roctracer_pool_t* po
// Notifies that the calling thread is entering an external API region.
// Push an external correlation id for the calling thread.
PUBLIC_API roctracer_status_t roctracer_activity_push_external_correlation_id(activity_correlation_id_t id) {
PUBLIC_API roctracer_status_t
roctracer_activity_push_external_correlation_id(activity_correlation_id_t id) {
API_METHOD_PREFIX
roctracer::external_id_stack.push(id);
API_METHOD_SUFFIX
@@ -1194,7 +1196,8 @@ PUBLIC_API roctracer_status_t roctracer_activity_push_external_correlation_id(ac
// Notifies that the calling thread is leaving an external API region.
// Pop an external correlation id for the calling thread.
// 'lastId' returns the last external correlation
PUBLIC_API roctracer_status_t roctracer_activity_pop_external_correlation_id(activity_correlation_id_t* last_id) {
PUBLIC_API roctracer_status_t
roctracer_activity_pop_external_correlation_id(activity_correlation_id_t* last_id) {
API_METHOD_PREFIX
if (last_id != NULL) *last_id = 0;
@@ -1214,7 +1217,8 @@ PUBLIC_API roctracer_status_t roctracer_activity_pop_external_correlation_id(act
PUBLIC_API void roctracer_mark(const char* str) {
if (mark_api_callback_ptr) {
mark_api_callback_ptr(ACTIVITY_DOMAIN_EXT_API, ACTIVITY_EXT_OP_MARK, str, NULL);
roctracer::GlobalCounter::Increment(); // account for user-defined markers when tracking correlation id
roctracer::GlobalCounter::Increment(); // account for user-defined markers when tracking
// correlation id
}
}
@@ -1245,15 +1249,14 @@ PUBLIC_API roctracer_status_t roctracer_get_timestamp(uint64_t* timestamp) {
}
// Set properties
PUBLIC_API roctracer_status_t roctracer_set_properties(
roctracer_domain_t domain,
void* properties)
{
PUBLIC_API roctracer_status_t roctracer_set_properties(roctracer_domain_t domain,
void* properties) {
API_METHOD_PREFIX
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: {
// HSA OPS properties
roctracer::hsa_ops_properties_t* ops_properties = reinterpret_cast<roctracer::hsa_ops_properties_t*>(properties);
roctracer::hsa_ops_properties_t* ops_properties =
reinterpret_cast<roctracer::hsa_ops_properties_t*>(properties);
HsaApiTable* table = reinterpret_cast<HsaApiTable*>(ops_properties->table);
roctracer::hsa_support::async_copy_callback_fun = ops_properties->async_copy_callback_fun;
roctracer::hsa_support::async_copy_callback_arg = ops_properties->async_copy_callback_arg;
@@ -1271,9 +1274,12 @@ PUBLIC_API roctracer_status_t roctracer_set_properties(
hsa_status_t status = hsa_amd_profiling_async_copy_enable(true);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable");
roctracer::hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn;
roctracer::hsa_amd_memory_async_copy_rect_fn = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn;
table->amd_ext_->hsa_amd_memory_async_copy_fn = roctracer::hsa_amd_memory_async_copy_interceptor;
table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = roctracer::hsa_amd_memory_async_copy_rect_interceptor;
roctracer::hsa_amd_memory_async_copy_rect_fn =
table->amd_ext_->hsa_amd_memory_async_copy_rect_fn;
table->amd_ext_->hsa_amd_memory_async_copy_fn =
roctracer::hsa_amd_memory_async_copy_interceptor;
table->amd_ext_->hsa_amd_memory_async_copy_rect_fn =
roctracer::hsa_amd_memory_async_copy_rect_interceptor;
break;
}
@@ -1291,11 +1297,13 @@ PUBLIC_API roctracer_status_t roctracer_set_properties(
case ACTIVITY_DOMAIN_HCC_OPS:
case ACTIVITY_DOMAIN_HIP_API: {
mark_api_callback_ptr = reinterpret_cast<mark_api_callback_t*>(properties);
if (roctracer::hip_act_cb_tracker == NULL) roctracer::hip_act_cb_tracker = new roctracer::hip_act_cb_tracker_t;
if (roctracer::hip_act_cb_tracker == NULL)
roctracer::hip_act_cb_tracker = new roctracer::hip_act_cb_tracker_t;
break;
}
case ACTIVITY_DOMAIN_EXT_API: {
roctracer_ext_properties_t* ops_properties = reinterpret_cast<roctracer_ext_properties_t*>(properties);
roctracer_ext_properties_t* ops_properties =
reinterpret_cast<roctracer_ext_properties_t*>(properties);
roctracer::ext_support::roctracer_start_cb = ops_properties->start_cb;
roctracer::ext_support::roctracer_stop_cb = ops_properties->stop_cb;
break;
@@ -1349,7 +1357,8 @@ PUBLIC_API void roctracer_flush_buf() {
CONSTRUCTOR_API void constructor() {
ONLOAD_TRACE_BEG();
roctracer::util::Logger::Create();
roctracer::trace_buffer = new roctracer::TraceBuffer<roctracer::trace_entry_t>("HSA GPU", 0x200000, roctracer::trace_buffer_prm, 2);
roctracer::trace_buffer = new roctracer::TraceBuffer<roctracer::trace_entry_t>(
"HSA GPU", 0x200000, roctracer::trace_buffer_prm, 2);
roctracer_load();
ONLOAD_TRACE_END();
}
+36 -36
Просмотреть файл
@@ -50,11 +50,7 @@
} while (0)
namespace roctracer {
enum {
TRACE_ENTRY_INV = 0,
TRACE_ENTRY_INIT = 1,
TRACE_ENTRY_COMPL = 2
};
enum { TRACE_ENTRY_INV = 0, TRACE_ENTRY_INIT = 1, TRACE_ENTRY_COMPL = 2 };
enum entry_type_t {
DFLT_ENTRY_TYPE = 0,
@@ -68,8 +64,8 @@ struct trace_entry_t {
std::atomic<uint32_t> valid;
entry_type_t type;
uint64_t dispatch;
uint64_t begin; // kernel begin timestamp, ns
uint64_t end; // kernel end timestamp, ns
uint64_t begin; // kernel begin timestamp, ns
uint64_t end; // kernel end timestamp, ns
uint64_t complete;
hsa_agent_t agent;
uint32_t dev_index;
@@ -86,8 +82,7 @@ struct trace_entry_t {
};
};
template <class T>
struct push_element_fun {
template <class T> struct push_element_fun {
T* const elem_;
T** prev_;
bool fun(T* node) {
@@ -105,10 +100,12 @@ struct push_element_fun {
push_element_fun(T* elem, T** prev) : elem_(elem), prev_(prev) {}
};
template <class T>
struct call_element_fun {
template <class T> struct call_element_fun {
void (T::*fptr_)();
bool fun(T* node) const { (node->*fptr_)(); return false; }
bool fun(T* node) const {
(node->*fptr_)();
return false;
}
call_element_fun(void (T::*f)()) : fptr_(f) {}
};
@@ -118,18 +115,26 @@ struct TraceBufferBase {
virtual void StartWorkerThread() = 0;
virtual void Flush() = 0;
static void StartWorkerThreadAll() { foreach(call_element_fun<TraceBufferBase>(&TraceBufferBase::StartWorkerThread)); }
static void FlushAll() { foreach(call_element_fun<TraceBufferBase>(&TraceBufferBase::Flush)); }
static void StartWorkerThreadAll() {
foreach (call_element_fun<TraceBufferBase>(&TraceBufferBase::StartWorkerThread))
;
}
static void FlushAll() {
foreach (call_element_fun<TraceBufferBase>(&TraceBufferBase::Flush))
;
}
static void Push(TraceBufferBase* elem) {
if (head_elem_ == NULL) head_elem_ = elem;
else foreach(push_element_fun<TraceBufferBase>(elem, &head_elem_));
if (head_elem_ == NULL)
head_elem_ = elem;
else
foreach (push_element_fun<TraceBufferBase>(elem, &head_elem_))
;
}
TraceBufferBase(const uint32_t& prior) : priority_(prior), next_elem_(NULL) {}
template<class F>
static void foreach(const F& f_in) {
template <class F> static void foreach (const F& f_in) {
std::lock_guard<mutex_t> lck(mutex_);
F f = f_in;
TraceBufferBase* p = head_elem_;
@@ -146,9 +151,8 @@ struct TraceBufferBase {
static mutex_t mutex_;
};
template <typename Entry>
class TraceBuffer : protected TraceBufferBase {
public:
template <typename Entry> class TraceBuffer : protected TraceBufferBase {
public:
typedef void (*callback_t)(Entry*);
typedef TraceBuffer<Entry> Obj;
typedef uint64_t pointer_t;
@@ -161,11 +165,9 @@ class TraceBuffer : protected TraceBufferBase {
callback_t fun;
};
TraceBuffer(const char* name, uint32_t size, const flush_prm_t* flush_prm_arr, uint32_t flush_prm_count, uint32_t prior = 0) :
TraceBufferBase(prior),
size_(size),
work_thread_started_(false)
{
TraceBuffer(const char* name, uint32_t size, const flush_prm_t* flush_prm_arr,
uint32_t flush_prm_count, uint32_t prior = 0)
: TraceBufferBase(prior), size_(size), work_thread_started_(false) {
name_ = strdup(name);
data_ = allocate_fun();
next_ = allocate_fun();
@@ -204,7 +206,7 @@ class TraceBuffer : protected TraceBufferBase {
std::lock_guard<mutex_t> lck(mutex_);
if (work_thread_started_ == true) {
PTHREAD_CALL(pthread_cancel(work_thread_));
void *res;
void* res;
PTHREAD_CALL(pthread_join(work_thread_, &res));
if (res != PTHREAD_CANCELED) FATAL("consumer thread wasn't stopped correctly");
work_thread_started_ = false;
@@ -223,7 +225,7 @@ class TraceBuffer : protected TraceBufferBase {
void Flush() { flush_buf(); }
private:
private:
void flush_buf() {
std::lock_guard<mutex_t> lck(mutex_);
@@ -231,7 +233,7 @@ class TraceBuffer : protected TraceBufferBase {
pointer_t curr_pointer = write_pointer_.load(std::memory_order_relaxed);
buf_list_it_t it = buf_list_.begin();
buf_list_it_t end_it = buf_list_.end();
while(it != end_it) {
while (it != end_it) {
Entry* buf = *it;
Entry* ptr = buf + (pointer % size_);
Entry* end_ptr = buf + size_;
@@ -261,15 +263,13 @@ class TraceBuffer : protected TraceBufferBase {
}
inline Entry* allocate_fun() {
Entry* ptr = (Entry*) malloc(size_ * sizeof(Entry));
Entry* ptr = (Entry*)malloc(size_ * sizeof(Entry));
if (ptr == NULL) FATAL("malloc failed");
//memset(ptr, 0, size_ * sizeof(Entry));
// memset(ptr, 0, size_ * sizeof(Entry));
return ptr;
}
inline void free_fun(void* ptr) {
free(ptr);
}
inline void free_fun(void* ptr) { free(ptr); }
static void* allocate_worker(void* arg) {
Obj* obj = (Obj*)arg;
@@ -321,8 +321,8 @@ class TraceBuffer : protected TraceBufferBase {
};
} // namespace roctracer
#define TRACE_BUFFER_INSTANTIATE() \
roctracer::TraceBufferBase* roctracer::TraceBufferBase::head_elem_ = NULL; \
#define TRACE_BUFFER_INSTANTIATE() \
roctracer::TraceBufferBase* roctracer::TraceBufferBase::head_elem_ = NULL; \
roctracer::TraceBufferBase::mutex_t roctracer::TraceBufferBase::mutex_;
#endif // SRC_CORE_TRACE_BUFFER_H_
+1 -3
Просмотреть файл
@@ -31,9 +31,7 @@ class HsaQueue : public Queue {
public:
HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {}
void Submit(const packet_t* packet) {
util::HsaRsrcFactory::Instance().Submit(queue_, packet);
}
void Submit(const packet_t* packet) { util::HsaRsrcFactory::Instance().Submit(queue_, packet); }
private:
hsa_queue_t* queue_;
+4 -4
Просмотреть файл
@@ -27,10 +27,10 @@ void InterceptQueue::HsaIntercept(HsaApiTable* table) {
}
InterceptQueue::mutex_t InterceptQueue::mutex_;
//rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL;
//InterceptQueue::queue_callback_t InterceptQueue::create_callback_ = NULL;
//InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL;
//void* InterceptQueue::callback_data_ = NULL;
// rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL;
// InterceptQueue::queue_callback_t InterceptQueue::create_callback_ = NULL;
// InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL;
// void* InterceptQueue::callback_data_ = NULL;
InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL;
const char* InterceptQueue::kernel_none_ = "";
bool InterceptQueue::in_create_call_ = false;
+36 -31
Просмотреть файл
@@ -37,7 +37,9 @@
#include "util/hsa_rsrc_factory.h"
#include "util/exception.h"
namespace roctracer { extern TraceBuffer<trace_entry_t>* trace_buffer; }
namespace roctracer {
extern TraceBuffer<trace_entry_t>* trace_buffer;
}
namespace rocprofiler {
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
@@ -48,25 +50,25 @@ class InterceptQueue {
typedef std::recursive_mutex mutex_t;
typedef std::map<uint64_t, InterceptQueue*> obj_map_t;
typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data);
typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg);
typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t* queue, void* arg);
typedef uint32_t queue_id_t;
static void HsaIntercept(HsaApiTable* table);
static hsa_status_t InterceptQueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue,
const bool& tracker_on) {
static hsa_status_t InterceptQueueCreate(
hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue,
const bool& tracker_on) {
std::lock_guard<mutex_t> lck(mutex_);
hsa_status_t status = HSA_STATUS_ERROR;
if (in_create_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()");
in_create_call_ = true;
ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size,
group_segment_size, queue, &status);
ProxyQueue* proxy =
ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size,
group_segment_size, queue, &status);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()");
status = util::HsaRsrcFactory::HsaApi()->hsa_amd_profiling_set_profiler_enabled(*queue, true);
@@ -78,7 +80,8 @@ class InterceptQueue {
obj->queue_id = current_queue_id;
(*obj_map_)[(uint64_t)(*queue)] = obj;
status = (is_enabled) ? proxy->SetInterceptCB(OnSubmitCB, obj) : proxy->SetInterceptCB(OnSubmitCB_dummy, obj);
status = (is_enabled) ? proxy->SetInterceptCB(OnSubmitCB, obj)
: proxy->SetInterceptCB(OnSubmitCB_dummy, obj);
#if 0
if (create_callback_ != NULL) {
@@ -96,15 +99,17 @@ class InterceptQueue {
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue) {
return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, false);
return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size,
group_segment_size, queue, false);
}
static hsa_status_t QueueCreateTracked(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue) {
return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, true);
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue) {
return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size,
group_segment_size, queue, true);
}
static hsa_status_t QueueDestroy(hsa_queue_t* queue) {
@@ -122,8 +127,8 @@ class InterceptQueue {
return status;
}
static void OnSubmitCB_dummy(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
hsa_amd_queue_intercept_packet_writer writer) {
static void OnSubmitCB_dummy(const void* in_packets, uint64_t count, uint64_t user_que_idx,
void* data, hsa_amd_queue_intercept_packet_writer writer) {
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
// Submitting the original packets if profiling was not enabled
@@ -161,8 +166,10 @@ class InterceptQueue {
::proxy::Tracker::entry_t* entry = roctracer::trace_buffer->GetEntry();
entry->kernel.tid = syscall(__NR_gettid);
entry->kernel.name = kernel_name;
::proxy::Tracker::Enable(roctracer::KERNEL_ENTRY_TYPE, obj->agent_info_->dev_id, completion_signal, entry);
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = entry->signal;
::proxy::Tracker::Enable(roctracer::KERNEL_ENTRY_TYPE, obj->agent_info_->dev_id,
completion_signal, entry);
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal =
entry->signal;
}
}
@@ -190,7 +197,7 @@ class InterceptQueue {
static void Enable(bool val) { is_enabled = val; }
private:
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
static void queue_event_callback(hsa_status_t status, hsa_queue_t* queue, void* arg) {
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported");
InterceptQueue* obj = GetObj(queue);
if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg);
@@ -201,7 +208,8 @@ class InterceptQueue {
return static_cast<hsa_packet_type_t>((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask);
}
static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) {
static const amd_kernel_code_t* GetKernelCode(
const hsa_kernel_dispatch_packet_t* dispatch_packet) {
const amd_kernel_code_t* kernel_code = NULL;
hsa_status_t status =
util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
@@ -256,22 +264,19 @@ class InterceptQueue {
assert(queue == obj->queue_);
delete obj;
obj_map_->erase(it);
status = HSA_STATUS_SUCCESS;;
status = HSA_STATUS_SUCCESS;
;
}
return status;
}
InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) :
queue_(queue),
proxy_(proxy)
{
InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy)
: queue_(queue), proxy_(proxy) {
agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
queue_event_callback_ = NULL;
}
~InterceptQueue() {
ProxyQueue::Destroy(proxy_);
}
~InterceptQueue() { ProxyQueue::Destroy(proxy_); }
static bool is_enabled;
+2 -2
Просмотреть файл
@@ -36,10 +36,10 @@ ProxyQueue* ProxyQueue::Create(hsa_agent_t agent, uint32_t size, hsa_queue_type3
hsa_status_t* status) {
hsa_status_t suc = HSA_STATUS_ERROR;
ProxyQueue* instance =
(rocp_type_) ? (ProxyQueue*) new SimpleProxyQueue() : (ProxyQueue*) new HsaProxyQueue();
(rocp_type_) ? (ProxyQueue*)new SimpleProxyQueue() : (ProxyQueue*)new HsaProxyQueue();
if (instance != NULL) {
suc = instance->Init(agent, size, type, callback, data, private_segment_size,
group_segment_size, queue);
group_segment_size, queue);
if (suc != HSA_STATUS_SUCCESS) {
delete instance;
instance = NULL;
+12 -6
Просмотреть файл
@@ -25,13 +25,19 @@ void SimpleProxyQueue::HsaIntercept(HsaApiTable* table) {
table->core_->hsa_signal_store_relaxed_fn = rocprofiler::SimpleProxyQueue::SignalStore;
table->core_->hsa_signal_store_screlease_fn = rocprofiler::SimpleProxyQueue::SignalStore;
table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex;
table->core_->hsa_queue_load_write_index_relaxed_fn =
rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_relaxed_fn =
rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_relaxed_fn =
rocprofiler::SimpleProxyQueue::GetSubmitIndex;
table->core_->hsa_queue_load_write_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_screlease_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex;
table->core_->hsa_queue_load_write_index_scacquire_fn =
rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_screlease_fn =
rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_scacquire_fn =
rocprofiler::SimpleProxyQueue::GetSubmitIndex;
}
SimpleProxyQueue::queue_map_t* SimpleProxyQueue::queue_map_ = NULL;
+6 -6
Просмотреть файл
@@ -30,7 +30,7 @@
#include "util/hsa_rsrc_factory.h"
#ifndef ROCP_PROXY_LOCK
# define ROCP_PROXY_LOCK 1
#define ROCP_PROXY_LOCK 1
#endif
namespace rocprofiler {
@@ -125,7 +125,8 @@ class SimpleProxyQueue : public ProxyQueue {
const uint64_t que_idx = hsa_queue_load_write_index_relaxed_fn(queue_);
// Waiting untill there is a free space in the queue
while (que_idx >= (hsa_queue_load_read_index_relaxed_fn(queue_) + size_));
while (que_idx >= (hsa_queue_load_read_index_relaxed_fn(queue_) + size_))
;
// Increment the write index
hsa_queue_store_write_index_relaxed_fn(queue_, que_idx + 1);
@@ -160,8 +161,7 @@ class SimpleProxyQueue : public ProxyQueue {
queue_mask_(0),
submit_index_(0),
on_submit_cb_(NULL),
on_submit_cb_data_(NULL)
{
on_submit_cb_data_(NULL) {
printf("ROCProfiler: SimpleProxyQueue is enabled\n");
fflush(stdout);
}
@@ -200,8 +200,8 @@ class SimpleProxyQueue : public ProxyQueue {
if (queue_map_ == NULL) queue_map_ = new queue_map_t;
(*queue_map_)[queue_->doorbell_signal.handle] = this;
}
else abort();
} else
abort();
}
}
if (status != HSA_STATUS_SUCCESS) abort();
+17 -11
Просмотреть файл
@@ -35,20 +35,21 @@
namespace proxy {
class Tracker {
public:
public:
typedef util::HsaRsrcFactory::timestamp_t timestamp_t;
typedef roctracer::trace_entry_t entry_t;
typedef roctracer::entry_type_t entry_type_t;
// Add tracker entry
inline static void Enable(entry_type_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) {
inline static void Enable(entry_type_t type, const hsa_agent_t& agent, const hsa_signal_t& signal,
entry_t* entry) {
hsa_status_t status = HSA_STATUS_ERROR;
util::HsaRsrcFactory* hsa_rsrc = &(util::HsaRsrcFactory::Instance());
// Creating a new tracker entry
entry->type = type;
entry->agent = agent;
entry->dev_index = 0; //hsa_rsrc->GetAgentInfo(agent)->dev_index;
entry->dev_index = 0; // hsa_rsrc->GetAgentInfo(agent)->dev_index;
entry->orig = signal;
entry->dispatch = hsa_rsrc->TimestampNs();
entry->valid.store(roctracer::TRACE_ENTRY_INIT, std::memory_order_release);
@@ -56,7 +57,8 @@ class Tracker {
// Creating a proxy signal
status = hsa_signal_create(1, 0, NULL, &(entry->signal));
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
status =
hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
}
@@ -66,7 +68,7 @@ class Tracker {
entry->valid.store(roctracer::TRACE_ENTRY_INV, std::memory_order_release);
}
private:
private:
// Entry completion
inline static void Complete(hsa_signal_value_t signal_value, entry_t* entry) {
// Query begin/end and complete timestamps
@@ -74,12 +76,14 @@ class Tracker {
if (entry->type == roctracer::COPY_ENTRY_TYPE) {
hsa_amd_profiling_async_copy_time_t async_copy_time{};
hsa_status_t status = hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_async_copy_time");
if (status != HSA_STATUS_SUCCESS)
EXC_RAISING(status, "hsa_amd_profiling_get_async_copy_time");
entry->begin = hsa_rsrc->SysclockToNs(async_copy_time.start);
entry->end = hsa_rsrc->SysclockToNs(async_copy_time.end);
} else {
hsa_amd_profiling_dispatch_time_t dispatch_time{};
hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time);
hsa_status_t status =
hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time");
entry->begin = hsa_rsrc->SysclockToNs(dispatch_time.start);
entry->end = hsa_rsrc->SysclockToNs(dispatch_time.end);
@@ -101,7 +105,8 @@ class Tracker {
orig_signal_ptr->end_ts = prof_signal_ptr->end_ts;
const hsa_signal_value_t new_value = hsa_signal_load_relaxed(orig) - 1;
if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value");
if (signal_value != new_value)
EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value");
hsa_signal_store_screlease(orig, signal_value);
}
hsa_signal_destroy(signal);
@@ -111,7 +116,8 @@ class Tracker {
static bool Handler(hsa_signal_value_t signal_value, void* arg) {
// Acquire entry
entry_t* entry = reinterpret_cast<entry_t*>(arg);
while (entry->valid.load(std::memory_order_acquire) != roctracer::TRACE_ENTRY_INIT) sched_yield();
while (entry->valid.load(std::memory_order_acquire) != roctracer::TRACE_ENTRY_INIT)
sched_yield();
// Complete entry
Tracker::Complete(signal_value, entry);
@@ -119,6 +125,6 @@ class Tracker {
}
};
} // namespace rocprofiler
} // namespace proxy
#endif // SRC_PROXY_TRACKER_H_
#endif // SRC_PROXY_TRACKER_H_
+4 -3
Просмотреть файл
@@ -34,11 +34,12 @@ typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
typedef uint32_t packet_word_t;
typedef uint64_t timestamp_t;
inline std::ostream& operator<< (std::ostream& out, const event_t& event) {
out << "[block_name(" << event.block_name << "). block_index(" << event.block_index << "). counter_id(" << event.counter_id << ")]";
inline std::ostream& operator<<(std::ostream& out, const event_t& event) {
out << "[block_name(" << event.block_name << "). block_index(" << event.block_index
<< "). counter_id(" << event.counter_id << ")]";
return out;
}
inline std::ostream& operator<< (std::ostream& out, const parameter_t& parameter) {
inline std::ostream& operator<<(std::ostream& out, const parameter_t& parameter) {
out << "[parameter_name(" << parameter.parameter_name << "). value(" << parameter.value << ")]";
return out;
}
+22 -12
Просмотреть файл
@@ -37,7 +37,6 @@
#define API_METHOD_PREFIX \
roctx_status_t err = ROCTX_STATUS_SUCCESS; \
try {
#define API_METHOD_SUFFIX \
} \
catch (std::exception & e) { \
@@ -52,7 +51,7 @@
ERR_LOGGING(__FUNCTION__ << "(), " << e.what()); \
err = roctx::GetExcStatus(e); \
} \
(void)err; \
(void)err;
#define API_METHOD_CATCH(X) \
} \
@@ -84,8 +83,10 @@ thread_map_t thread_map;
static thread_local message_stack_t* message_stack = NULL;
roctx_status_t GetExcStatus(const std::exception& e) {
const roctracer::util::exception* roctx_exc_ptr = dynamic_cast<const roctracer::util::exception*>(&e);
return (roctx_exc_ptr) ? static_cast<roctx_status_t>(roctx_exc_ptr->status()) : ROCTX_STATUS_ERROR;
const roctracer::util::exception* roctx_exc_ptr =
dynamic_cast<const roctracer::util::exception*>(&e);
return (roctx_exc_ptr) ? static_cast<roctx_status_t>(roctx_exc_ptr->status())
: ROCTX_STATUS_ERROR;
}
void thread_data_init() {
@@ -124,7 +125,8 @@ PUBLIC_API void roctxMarkA(const char* message) {
activity_rtapi_callback_t api_callback_fun = NULL;
void* api_callback_arg = NULL;
roctx::cb_table.get(ROCTX_API_ID_roctxMarkA, &api_callback_fun, &api_callback_arg);
if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxMarkA, &api_data, api_callback_arg);
if (api_callback_fun)
api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxMarkA, &api_data, api_callback_arg);
API_METHOD_SUFFIX_NRET
}
@@ -137,7 +139,9 @@ PUBLIC_API int roctxRangePushA(const char* message) {
activity_rtapi_callback_t api_callback_fun = NULL;
void* api_callback_arg = NULL;
roctx::cb_table.get(ROCTX_API_ID_roctxRangePushA, &api_callback_fun, &api_callback_arg);
if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangePushA, &api_data, api_callback_arg);
if (api_callback_fun)
api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangePushA, &api_data,
api_callback_arg);
roctx::message_stack->push(strdup(message));
return roctx::message_stack->size() - 1;
@@ -152,11 +156,13 @@ PUBLIC_API int roctxRangePop() {
activity_rtapi_callback_t api_callback_fun = NULL;
void* api_callback_arg = NULL;
roctx::cb_table.get(ROCTX_API_ID_roctxRangePop, &api_callback_fun, &api_callback_arg);
if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangePop, &api_data, api_callback_arg);
if (api_callback_fun)
api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangePop, &api_data,
api_callback_arg);
if (roctx::message_stack->empty()) {
EXC_ABORT(ROCTX_STATUS_ERROR, "Pop from empty stack!");
EXC_ABORT(ROCTX_STATUS_ERROR, "Pop from empty stack!");
} else {
roctx::message_stack->pop();
roctx::message_stack->pop();
}
return roctx::message_stack->size();
@@ -173,7 +179,9 @@ PUBLIC_API roctx_range_id_t roctxRangeStartA(const char* message) {
activity_rtapi_callback_t api_callback_fun = NULL;
void* api_callback_arg = NULL;
roctx::cb_table.get(ROCTX_API_ID_roctxRangeStartA, &api_callback_fun, &api_callback_arg);
if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangeStartA, &api_data, api_callback_arg);
if (api_callback_fun)
api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangeStartA, &api_data,
api_callback_arg);
return roctx_range_counter;
API_METHOD_CATCH(-1);
@@ -186,14 +194,16 @@ PUBLIC_API void roctxRangeStop(roctx_range_id_t rangeId) {
activity_rtapi_callback_t api_callback_fun = NULL;
void* api_callback_arg = NULL;
roctx::cb_table.get(ROCTX_API_ID_roctxRangeStop, &api_callback_fun, &api_callback_arg);
if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangeStop, &api_data, api_callback_arg);
if (api_callback_fun)
api_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangeStop, &api_data,
api_callback_arg);
API_METHOD_SUFFIX_NRET
}
PUBLIC_API void RangeStackIterate(roctx_range_iterate_cb_t callback, void* arg) {
for (const auto& entry : roctx::thread_map) {
const auto tid = entry.first;
for (roctx::message_stack_t stack = *(entry.second); !stack.empty(); stack.pop()){
for (roctx::message_stack_t stack = *(entry.second); !stack.empty(); stack.pop()) {
std::string message = stack.top();
roctx_range_data_t data{};
data.message = message.c_str();
+1 -3
Просмотреть файл
@@ -45,8 +45,6 @@ PUBLIC_API bool RegisterApiCallback(uint32_t op, void* callback, void* arg) {
return roctx::cb_table.set(op, reinterpret_cast<activity_rtapi_callback_t>(callback), arg);
}
PUBLIC_API bool RemoveApiCallback(uint32_t op) {
return roctx::cb_table.set(op, NULL, NULL);
}
PUBLIC_API bool RemoveApiCallback(uint32_t op) { return roctx::cb_table.set(op, NULL, NULL); }
} // extern "C"
+2 -2
Просмотреть файл
@@ -43,12 +43,12 @@
#define HCC_EXC_RAISING(error, stream) \
do { \
EXC_RAISING(error, "HCC error: " << stream); \
} while(0)
} while (0)
#define HIP_EXC_RAISING(error, stream) \
do { \
EXC_RAISING(error, "HIP error: " << stream); \
} while(0)
} while (0)
namespace roctracer {
namespace util {
+101 -58
Просмотреть файл
@@ -69,13 +69,15 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
if (HSA_AMD_SEGMENT_GLOBAL != segment) {
return HSA_STATUS_SUCCESS;
}
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
@@ -128,19 +130,22 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize
#ifdef ROCP_LD_AQLPROFILE
status = LoadAqlProfileLib(&aqlprofile_api_);
#else
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_);
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE,
hsa_ven_amd_aqlprofile_VERSION_MAJOR,
sizeof(aqlprofile_api_), &aqlprofile_api_);
#endif
CHECK_STATUS("aqlprofile API table load failed", status);
// Get Loader API table
loader_api_ = {0};
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_);
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1,
sizeof(loader_api_), &loader_api_);
CHECK_STATUS("loader API table query failed", status);
// Instantiate HSA timer
timer_ = new HsaTimer(&hsa_api_);
CHECK_STATUS("HSA timer allocation failed",
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS);
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS);
// Time correlation
const uint32_t corr_iters = 1000;
@@ -148,7 +153,8 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize
CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters);
// System timeout
timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_);
timeout_ =
(timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_);
}
// Destructor of the class
@@ -174,9 +180,12 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_queue_load_write_index_relaxed =
table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed =
table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed =
table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
@@ -185,18 +194,22 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn;
hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn;
hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn;
hsa_api_.hsa_code_object_reader_create_from_file =
table->core_->hsa_code_object_reader_create_from_file_fn;
hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn;
hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn;
hsa_api_.hsa_executable_load_agent_code_object =
table->core_->hsa_executable_load_agent_code_object_fn;
hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn;
hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn;
hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn;
hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn;
hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn;
hsa_api_.hsa_system_get_major_extension_table =
table->core_->hsa_system_get_major_extension_table_fn;
hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
hsa_api_.hsa_amd_agent_iterate_memory_pools =
table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn;
hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn;
hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn;
@@ -204,9 +217,12 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_amd_memory_async_copy_rect = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn;
hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn;
hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn;
hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn;
hsa_api_.hsa_amd_profiling_set_profiler_enabled =
table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
hsa_api_.hsa_amd_profiling_get_async_copy_time =
table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn;
hsa_api_.hsa_amd_profiling_get_dispatch_time =
table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn;
} else {
hsa_api_.hsa_init = hsa_init;
hsa_api_.hsa_shut_down = hsa_shut_down;
@@ -302,10 +318,13 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
agent_info->dev_index = cpu_list_.size();
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
status =
hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool;
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool);
if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool;
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool,
&agent_info->kern_arg_pool);
if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL))
kern_arg_pool_ = &agent_info->kern_arg_pool;
agent_info->gpu_pool = {};
cpu_list_.push_back(agent_info);
@@ -317,28 +336,34 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
agent_info->dev_id = agent;
agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
const int gfxip_label_len = std::min (strlen(agent_info->name) - 2, sizeof (agent_info->gfxip) - 1);
const int gfxip_label_len =
std::min(strlen(agent_info->name) - 2, sizeof(agent_info->gfxip) - 1);
memcpy(agent_info->gfxip, agent_info->name, gfxip_label_len);
agent_info->gfxip[gfxip_label_len] = '\0';
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
&agent_info->cu_num);
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
&agent_info->waves_per_cu);
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
&agent_info->simds_per_cu);
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
&agent_info->se_num);
hsa_api_.hsa_agent_get_info(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
&agent_info->cu_num);
hsa_api_.hsa_agent_get_info(agent,
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
&agent_info->shader_arrays_per_se);
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
&agent_info->waves_per_cu);
hsa_api_.hsa_agent_get_info(agent,
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
&agent_info->simds_per_cu);
hsa_api_.hsa_agent_get_info(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
&agent_info->se_num);
hsa_api_.hsa_agent_get_info(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
&agent_info->shader_arrays_per_se);
agent_info->cpu_pool = {};
agent_info->kern_arg_pool = {};
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
status =
hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);
// GFX8 and GFX9 SGPR/VGPR block sizes
@@ -435,7 +460,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
hsa_queue_t** queue) {
hsa_status_t status;
status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
UINT32_MAX, UINT32_MAX, queue);
UINT32_MAX, UINT32_MAX, queue);
return (status == HSA_STATUS_SUCCESS);
}
@@ -458,7 +483,8 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t
hsa_status_t status = HSA_STATUS_ERROR;
uint8_t* buffer = NULL;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast<void**>(&buffer));
status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0,
reinterpret_cast<void**>(&buffer));
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
return ptr;
}
@@ -473,7 +499,8 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size
uint8_t* buffer = NULL;
if (!cpu_agents_.empty()) {
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast<void**>(&buffer));
status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0,
reinterpret_cast<void**>(&buffer));
// Both the CPU and GPU can access the kernel arguments
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {agent_info->dev_id};
@@ -493,7 +520,8 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s
uint8_t* buffer = NULL;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
if (!cpu_agents_.empty()) {
status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast<void**>(&buffer));
status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0,
reinterpret_cast<void**>(&buffer));
// Both the CPU and GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {agent_info->dev_id};
@@ -518,16 +546,18 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s
}
// Wait signal
hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const {
hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const {
const hsa_signal_value_t exp_value = signal_value - 1;
hsa_signal_value_t ret_value = signal_value;
while (1) {
ret_value =
hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED);
ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value,
timeout_, HSA_WAIT_STATE_BLOCKED);
if (ret_value == exp_value) break;
if (ret_value != signal_value) {
std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value
<< "), ret_value(" << ret_value << ")" << std::endl << std::flush;
<< "), ret_value(" << ret_value << ")" << std::endl
<< std::flush;
abort();
}
}
@@ -535,7 +565,8 @@ hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const
}
// Wait signal with signal value restore
void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const {
void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const {
SignalWait(signal, signal_value);
hsa_api_.hsa_signal_store_relaxed(const_cast<hsa_signal_t&>(signal), signal_value);
}
@@ -599,13 +630,13 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br
}
// Create executable.
status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
NULL, executable);
status = hsa_api_.hsa_executable_create_alt(
HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
CHECK_STATUS("Error in creating executable object", status);
// Load code object.
status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr,
NULL, NULL);
status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
code_obj_rdr, NULL, NULL);
CHECK_STATUS("Error in loading executable object", status);
// Freeze executable.
@@ -615,7 +646,7 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br
// Get symbol handle.
hsa_executable_symbol_t kernelSymbol;
status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
&kernelSymbol);
&kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
// Update output parameter
@@ -659,7 +690,8 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
}
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
uint32_t* queue_slot =
reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
// Copy buffered commands into the queue slot.
@@ -709,18 +741,22 @@ void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) {
table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
}
hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) {
hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec,
hsa_executable_symbol_t symbol, void* data) {
hsa_symbol_kind_t value = (hsa_symbol_kind_t)0;
hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value);
hsa_status_t status =
hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value);
CHECK_STATUS("Error in getting symbol info", status);
if (value == HSA_SYMBOL_KIND_KERNEL) {
uint64_t addr = 0;
uint32_t len = 0;
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr);
status = hsa_api_.hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr);
CHECK_STATUS("Error in getting kernel object", status);
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len);
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
&len);
CHECK_STATUS("Error in getting name len", status);
char *name = new char[len + 1];
char* name = new char[len + 1];
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
CHECK_STATUS("Error in getting kernel name", status);
name[len] = 0;
@@ -733,12 +769,15 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_ex
return HSA_STATUS_SUCCESS;
}
hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) {
hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable,
const char* options) {
std::lock_guard<mutex_t> lck(mutex_);
if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t;
hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
hsa_status_t status =
hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
CHECK_STATUS("Error in iterating executable symbols", status);
return hsa_api_.hsa_executable_freeze(executable, options);;
return hsa_api_.hsa_executable_freeze(executable, options);
;
}
void HsaRsrcFactory::DumpHandles(FILE* file) {
@@ -746,10 +785,14 @@ void HsaRsrcFactory::DumpHandles(FILE* file) {
auto end = agent_map_.end();
for (auto it = beg; it != end; ++it) {
const AgentInfo* agent_info = it->second;
fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle, (agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu");
if (agent_info->cpu_pool.handle != 0) fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle);
if (agent_info->kern_arg_pool.handle != 0) fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle);
if (agent_info->gpu_pool.handle != 0) fprintf(file, "0x%lx pool gpu%u\n", agent_info->gpu_pool.handle, agent_info->dev_index);
fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle,
(agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu");
if (agent_info->cpu_pool.handle != 0)
fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle);
if (agent_info->kern_arg_pool.handle != 0)
fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle);
if (agent_info->gpu_pool.handle != 0)
fprintf(file, "0x%lx pool gpu%u\n", agent_info->gpu_pool.handle, agent_info->dev_index);
}
fflush(file);
}
+34 -30
Просмотреть файл
@@ -44,23 +44,25 @@
#define HSA_QUEUE_ALIGN_BYTES 64
#define HSA_PACKET_ALIGN_BYTES 64
#define CHECK_STATUS(msg, status) do { \
if ((status) != HSA_STATUS_SUCCESS) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
#define CHECK_STATUS(msg, status) \
do { \
if ((status) != HSA_STATUS_SUCCESS) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
#define CHECK_ITER_STATUS(msg, status) do { \
if ((status) != HSA_STATUS_INFO_BREAK) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
#define CHECK_ITER_STATUS(msg, status) \
do { \
if ((status) != HSA_STATUS_INFO_BREAK) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
namespace util {
static const size_t MEM_PAGE_BYTES = 0x1000;
@@ -174,15 +176,12 @@ class HsaTimer {
static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
typedef long double freq_t;
enum time_id_t {
TIME_ID_CLOCK_REALTIME = 0,
TIME_ID_CLOCK_MONOTONIC = 1,
TIME_ID_NUMBER
};
enum time_id_t { TIME_ID_CLOCK_REALTIME = 0, TIME_ID_CLOCK_MONOTONIC = 1, TIME_ID_NUMBER };
HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) {
timestamp_t sysclock_hz = 0;
hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
hsa_status_t status =
hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status);
sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz;
}
@@ -217,8 +216,8 @@ class HsaTimer {
// Return pair of correlated values of profiling timestamp and time with
// correlation error for a given time ID and number of iterations
void correlated_pair_ns(time_id_t time_id, uint32_t iters,
timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) const {
void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v,
timestamp_t* time_v, timestamp_t* error_v) const {
clockid_t clock_id = 0;
switch (time_id) {
case TIME_ID_CLOCK_REALTIME:
@@ -357,7 +356,8 @@ class HsaRsrcFactory {
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);
// Wait signal
hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const;
hsa_signal_value_t SignalWait(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const;
// Wait signal with signal value restore
void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const;
@@ -403,7 +403,9 @@ class HsaRsrcFactory {
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
// Methods for system-clock/ns conversion and timestamp in 'ns'
timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); }
timestamp_t SysclockToNs(const timestamp_t& sysclock) const {
return timer_->sysclock_to_ns(sysclock);
}
timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); }
timestamp_t TimestampNs() const { return timer_->timestamp_ns(); }
@@ -482,8 +484,10 @@ class HsaRsrcFactory {
typedef std::map<uint64_t, const char*> symbols_map_t;
static symbols_map_t* symbols_map_;
static bool executable_tracking_on_;
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options);
static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data);
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable,
const char* options);
static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol,
void* data);
// HSA runtime API table
static hsa_pfn_t hsa_api_;
@@ -507,8 +511,8 @@ class HsaRsrcFactory {
timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER];
// CPU/kern-arg memory pools
hsa_amd_memory_pool_t *cpu_pool_;
hsa_amd_memory_pool_t *kern_arg_pool_;
hsa_amd_memory_pool_t* cpu_pool_;
hsa_amd_memory_pool_t* kern_arg_pool_;
};
} // namespace util
+20 -20
Просмотреть файл
@@ -168,32 +168,32 @@ class Logger {
} // namespace roctracer
#define ERR_LOGGING(stream) \
do { \
roctracer::util::Logger::Instance() << "error: " << roctracer::util::Logger::begm \
<< stream << roctracer::util::Logger::endl; \
} while(0)
do { \
roctracer::util::Logger::Instance() \
<< "error: " << roctracer::util::Logger::begm << stream << roctracer::util::Logger::endl; \
} while (0)
#define INFO_LOGGING(stream) \
do { \
roctracer::util::Logger::Instance() << "info: " << roctracer::util::Logger::begm << stream \
<< roctracer::util::Logger::endl; \
} while(0)
do { \
roctracer::util::Logger::Instance() \
<< "info: " << roctracer::util::Logger::begm << stream << roctracer::util::Logger::endl; \
} while (0)
#define WARN_LOGGING(stream) \
do { \
std::cerr << "ROCProfiler: " << stream << std::endl; \
roctracer::util::Logger::Instance() << "warning: " << roctracer::util::Logger::begm << stream \
<< roctracer::util::Logger::endl; \
} while(0)
do { \
std::cerr << "ROCProfiler: " << stream << std::endl; \
roctracer::util::Logger::Instance() << "warning: " << roctracer::util::Logger::begm << stream \
<< roctracer::util::Logger::endl; \
} while (0)
#ifdef DEBUG
#define DBG_LOGGING(stream) \
do { \
roctracer::util::Logger::Instance() << roctracer::util::Logger::begm << "debug: \"" \
<< stream << "\"" < < < < \
do { \
roctracer::util::Logger::Instance() \
<< roctracer::util::Logger::begm << "debug: \"" << stream << "\"" < < < < \
" in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \
<< roctracer::util::Logger::endl; \
} while(0)
<< roctracer::util::Logger::endl; \
} while (0)
#endif
#if DEBUG_TRACE_ON
@@ -204,8 +204,8 @@ inline static void DEBUG_TRACE(const char* fmt, ...) {
va_list valist;
va_start(valist, fmt);
vsnprintf(buf, size, fmt, valist);
printf("%u:%u %s",
roctracer::util::Logger::GetPid(), roctracer::util::Logger::GetTid(), buf); fflush(stdout);
printf("%u:%u %s", roctracer::util::Logger::GetPid(), roctracer::util::Logger::GetTid(), buf);
fflush(stdout);
va_end(valist);
}
#else
+49 -51
Просмотреть файл
@@ -37,57 +37,56 @@
#define THREADS_PER_BLOCK_Z 1
// Mark API
extern "C"
void roctracer_mark(const char* str);
extern "C" void roctracer_mark(const char* str);
// Device (Kernel) function, it must be void
__global__ void matrixTranspose(float* out, float* in, const int width) {
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
out[y * width + x] = in[x * width + y];
out[y * width + x] = in[x * width + y];
}
// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
}
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
}
}
}
int main() {
float* Matrix;
float* TransposeMatrix;
float* cpuTransposeMatrix;
float* Matrix;
float* TransposeMatrix;
float* cpuTransposeMatrix;
float* gpuMatrix;
float* gpuTransposeMatrix;
float* gpuMatrix;
float* gpuTransposeMatrix;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
std::cout << "Device name " << devProp.name << std::endl;
std::cout << "Device name " << devProp.name << std::endl;
int i;
int errors;
int i;
int errors;
Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
}
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
uint32_t iterations = 100;
while (iterations-- > 0) {
uint32_t iterations = 100;
while (iterations-- > 0) {
std::cout << "## Iteration (" << iterations << ") #################" << std::endl;
// Memory transfer from host to device
@@ -98,9 +97,9 @@ int main() {
int rangeId = roctxRangeStart("hipLaunchKernel range");
roctxRangePush("hipLaunchKernel");
// Lauching kernel from host
hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
gpuMatrix, WIDTH);
hipLaunchKernelGGL(
matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH);
roctracer_mark("after HIP LaunchKernel");
roctxMark("after hipLaunchKernel");
@@ -109,8 +108,8 @@ int main() {
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
roctxRangePop(); // for "hipMemcpy"
roctxRangePop(); // for "hipLaunchKernel"
roctxRangePop(); // for "hipMemcpy"
roctxRangePop(); // for "hipLaunchKernel"
roctxRangeStop(rangeId);
// CPU MatrixTranspose computation
@@ -120,26 +119,25 @@ int main() {
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
printf("FAILED: %d errors\n", errors);
} else {
printf("PASSED!\n");
printf("PASSED!\n");
}
}
}
// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
return errors;
return errors;
}
+71 -82
Просмотреть файл
@@ -45,7 +45,7 @@ static char* message = NULL;
#endif
void SPRINT(const char* fmt, ...) {
if (msg_buf == NULL) {
msg_buf = (char*) calloc(msg_size, 1);
msg_buf = (char*)calloc(msg_size, 1);
message = msg_buf;
}
@@ -66,13 +66,18 @@ void SFLUSH() {
// hip header file
#include <hip/hip_runtime.h>
// Macro to call HIP API
#define HIP_CALL(call) do { call; } while(0)
#define HIP_CALL(call) \
do { \
call; \
} while (0)
#else
#define HIP_CALL(call) do {} while(0)
#define HIP_CALL(call) \
do { \
} while (0)
#endif
#ifndef ITERATIONS
# define ITERATIONS 101
#define ITERATIONS 101
#endif
#define WIDTH 1024
#define NUM (WIDTH * WIDTH)
@@ -83,20 +88,20 @@ void SFLUSH() {
#if HIP_TEST
// Device (Kernel) function, it must be void
__global__ void matrixTranspose(float* out, float* in, const int width) {
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
out[y * width + x] = in[x * width + y];
out[y * width + x] = in[x * width + y];
}
#endif
// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
}
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
}
}
}
int iterations = ITERATIONS;
@@ -105,28 +110,28 @@ void start_tracing();
void stop_tracing();
int main() {
float* Matrix;
float* TransposeMatrix;
float* cpuTransposeMatrix;
float* Matrix;
float* TransposeMatrix;
float* cpuTransposeMatrix;
float* gpuMatrix;
float* gpuTransposeMatrix;
float* gpuMatrix;
float* gpuTransposeMatrix;
int i;
int errors;
int i;
int errors;
init_tracing();
init_tracing();
#if HIP_TEST
int gpuCount = 1;
int gpuCount = 1;
#if MGPU_TEST
hipGetDeviceCount(&gpuCount);
printf("Number of GPUs: %d\n", gpuCount);
hipGetDeviceCount(&gpuCount);
printf("Number of GPUs: %d\n", gpuCount);
#endif
iterations *= gpuCount;
iterations *= gpuCount;
#endif
while (iterations-- > 0) {
while (iterations-- > 0) {
start_tracing();
#if HIP_TEST
@@ -145,7 +150,7 @@ int main() {
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
Matrix[i] = (float)i * 10.0f;
}
// allocate the memory on the device side
@@ -167,9 +172,10 @@ int main() {
roctxRangePush("hipLaunchKernel");
// Lauching kernel from host
HIP_CALL(hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
gpuMatrix, WIDTH));
HIP_CALL(hipLaunchKernelGGL(matrixTranspose,
dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
gpuTransposeMatrix, gpuMatrix, WIDTH));
roctxMark("after hipLaunchKernel");
@@ -179,10 +185,11 @@ int main() {
// Memory transfer from device to host
roctxRangePush("hipMemcpy");
HIP_CALL(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost));
HIP_CALL(
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost));
roctxRangePop(); // for "hipMemcpy"
roctxRangePop(); // for "hipLaunchKernel"
roctxRangePop(); // for "hipMemcpy"
roctxRangePop(); // for "hipLaunchKernel"
// correlation reagion end
roctracer_activity_pop_external_correlation_id(NULL);
@@ -194,15 +201,15 @@ int main() {
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
if (abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
}
if ((HIP_TEST != 0) && (errors != 0)) {
printf("FAILED: %d errors\n", errors);
printf("FAILED: %d errors\n", errors);
} else {
errors = 0;
printf("PASSED!\n");
errors = 0;
printf("PASSED!\n");
}
// free the resources on device side
@@ -218,11 +225,11 @@ int main() {
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
}
}
stop_tracing();
stop_tracing();
return errors;
return errors;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -234,15 +241,15 @@ int main() {
#include <roctracer_hsa.h>
#include <roctracer_roctx.h>
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
// Macro to check ROC-tracer calls status
#define ROCTRACER_CALL(call) \
do { \
int err = call; \
if (err != 0) { \
fprintf(stderr, "%s\n", roctracer_error_string()); \
fprintf(stderr, "%s\n", roctracer_error_string()); \
abort(); \
} \
} while (0)
@@ -252,12 +259,7 @@ static inline uint32_t GetPid() { return syscall(__NR_getpid); }
// Runtime API callback function
void api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
void api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) {
(void)arg;
if (domain == ACTIVITY_DOMAIN_ROCTX) {
@@ -267,31 +269,25 @@ void api_callback(
}
const hip_api_data_t* data = (const hip_api_data_t*)(callback_data);
SPRINT("<%s id(%u)\tcorrelation_id(%lu) %s pid(%d) tid(%d)> ",
roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0),
cid,
data->correlation_id,
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit", GetPid(), GetTid());
roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, cid, 0), cid, data->correlation_id,
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit", GetPid(), GetTid());
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
switch (cid) {
case HIP_API_ID_hipMemcpy:
SPRINT("dst(%p) src(%p) size(0x%x) kind(%u)",
data->args.hipMemcpy.dst,
data->args.hipMemcpy.src,
(uint32_t)(data->args.hipMemcpy.sizeBytes),
(uint32_t)(data->args.hipMemcpy.kind));
SPRINT("dst(%p) src(%p) size(0x%x) kind(%u)", data->args.hipMemcpy.dst,
data->args.hipMemcpy.src, (uint32_t)(data->args.hipMemcpy.sizeBytes),
(uint32_t)(data->args.hipMemcpy.kind));
break;
case HIP_API_ID_hipMalloc:
SPRINT("ptr(%p) size(0x%x)",
data->args.hipMalloc.ptr,
(uint32_t)(data->args.hipMalloc.size));
SPRINT("ptr(%p) size(0x%x)", data->args.hipMalloc.ptr,
(uint32_t)(data->args.hipMalloc.size));
break;
case HIP_API_ID_hipFree:
SPRINT("ptr(%p)", data->args.hipFree.ptr);
break;
case HIP_API_ID_hipModuleLaunchKernel:
SPRINT("kernel(\"%s\") stream(%p)",
hipKernelNameRef(data->args.hipModuleLaunchKernel.f),
data->args.hipModuleLaunchKernel.stream);
SPRINT("kernel(\"%s\") stream(%p)", hipKernelNameRef(data->args.hipModuleLaunchKernel.f),
data->args.hipModuleLaunchKernel.stream);
break;
default:
break;
@@ -316,26 +312,17 @@ void activity_callback(const char* begin, const char* end, void* arg) {
SPRINT("\tActivity records:\n");
while (record < end_record) {
const char * name = roctracer_op_string(record->domain, record->op, record->kind);
SPRINT("\t%s\tcorrelation_id(%lu) time_ns(%lu:%lu)",
name,
record->correlation_id,
record->begin_ns,
record->end_ns);
const char* name = roctracer_op_string(record->domain, record->op, record->kind);
SPRINT("\t%s\tcorrelation_id(%lu) time_ns(%lu:%lu)", name, record->correlation_id,
record->begin_ns, record->end_ns);
if (record->domain == ACTIVITY_DOMAIN_HIP_API) {
SPRINT(" process_id(%u) thread_id(%u)",
record->process_id,
record->thread_id);
SPRINT(" process_id(%u) thread_id(%u)", record->process_id, record->thread_id);
} else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) {
SPRINT(" device_id(%d) queue_id(%lu)",
record->device_id,
record->queue_id);
SPRINT(" device_id(%d) queue_id(%lu)", record->device_id, record->queue_id);
if (record->op == HIP_OP_ID_COPY) SPRINT(" bytes(0x%zx)", record->bytes);
} else if (record->domain == ACTIVITY_DOMAIN_HSA_OPS) {
SPRINT(" se(%u) cycle(%lu) pc(%lx)",
record->pc_sample.se,
record->pc_sample.cycle,
record->pc_sample.pc);
SPRINT(" se(%u) cycle(%lu) pc(%lx)", record->pc_sample.se, record->pc_sample.cycle,
record->pc_sample.pc);
} else if (record->domain == ACTIVITY_DOMAIN_EXT_API) {
SPRINT(" external_id(%lu)", record->external_id);
} else {
@@ -377,8 +364,10 @@ void init_tracing() {
void start_tracing() {
printf("# START (%d) #############################\n", iterations);
// Start
if ((iterations & 1) == 1) roctracer_start();
else roctracer_stop();
if ((iterations & 1) == 1)
roctracer_start();
else
roctracer_stop();
}
// Stop tracing routine
+10 -12
Просмотреть файл
@@ -43,19 +43,18 @@ void check_status(roctracer_status_t status) {
void codeobj_callback(uint32_t domain, uint32_t cid, const void* data, void* arg) {
const hsa_evt_data_t* evt_data = reinterpret_cast<const hsa_evt_data_t*>(data);
const char* uri = evt_data->codeobj.uri;
printf("codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) uri(\"%s\")\n",
domain,
cid,
evt_data->codeobj.load_base,
evt_data->codeobj.load_size,
evt_data->codeobj.load_delta,
uri);
printf(
"codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) "
"uri(\"%s\")\n",
domain, cid, evt_data->codeobj.load_base, evt_data->codeobj.load_size,
evt_data->codeobj.load_delta, uri);
free((void*)uri);
fflush(stdout);
}
void initialize() {
roctracer_status_t status = roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, codeobj_callback, NULL);
roctracer_status_t status = roctracer_enable_op_callback(
ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, codeobj_callback, NULL);
check_status(status);
}
@@ -79,9 +78,8 @@ extern "C" PUBLIC_API void OnUnloadTool() {
}
extern "C" CONSTRUCTOR_API void constructor() {
printf("constructor\n"); fflush(stdout);
printf("constructor\n");
fflush(stdout);
}
extern "C" DESTRUCTOR_API void destructor() {
OnUnloadTool();
}
extern "C" DESTRUCTOR_API void destructor() { OnUnloadTool(); }
+44 -57
Просмотреть файл
@@ -28,25 +28,24 @@
#define CONSTRUCTOR_API __attribute__((constructor))
#define DESTRUCTOR_API __attribute__((destructor))
#define HSA_RT(call) \
do { \
const hsa_status_t status = call; \
if (status != HSA_STATUS_SUCCESS) { \
printf("error \"%s\"\n", #call); fflush(stdout); \
abort(); \
} \
} while(0)
#define HSA_RT(call) \
do { \
const hsa_status_t status = call; \
if (status != HSA_STATUS_SUCCESS) { \
printf("error \"%s\"\n", #call); \
fflush(stdout); \
abort(); \
} \
} while (0)
// HSA API intercepting primitives
decltype(hsa_executable_freeze)* hsa_executable_freeze_fn;
hsa_ven_amd_loader_1_01_pfn_t loader_api_table{};
hsa_status_t code_object_callback(
hsa_executable_t executable,
hsa_loaded_code_object_t loaded_code_object,
void* arg)
{
printf("code_object_callback\n"); fflush(stdout);
hsa_status_t code_object_callback(hsa_executable_t executable,
hsa_loaded_code_object_t loaded_code_object, void* arg) {
printf("code_object_callback\n");
fflush(stdout);
uint64_t load_base = 0;
uint64_t load_size = 0;
@@ -55,21 +54,13 @@ hsa_status_t code_object_callback(
char* uri_str = NULL;
HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object,
HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE,
&load_base));
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, &load_base));
HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object,
HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE,
&load_size));
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, &load_size));
HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object,
HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA,
&load_delta));
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, &load_delta));
HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object,
HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH,
&uri_len));
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, &uri_len));
uri_str = (char*)calloc(uri_len + 1, sizeof(char));
if (!uri_str) {
@@ -78,63 +69,59 @@ hsa_status_t code_object_callback(
}
HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object,
HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI,
uri_str));
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, uri_str));
printf("load_base(0x%lx)\n", load_base); fflush(stdout);
printf("load_size(0x%lx)\n", load_size); fflush(stdout);
printf("load_delta(0x%lx)\n", load_delta); fflush(stdout);
printf("uri_len(%u)\n", uri_len); fflush(stdout);
printf("uri_str(\"%s\")\n", uri_str); fflush(stdout);
printf("load_base(0x%lx)\n", load_base);
fflush(stdout);
printf("load_size(0x%lx)\n", load_size);
fflush(stdout);
printf("load_delta(0x%lx)\n", load_delta);
fflush(stdout);
printf("uri_len(%u)\n", uri_len);
fflush(stdout);
printf("uri_str(\"%s\")\n", uri_str);
fflush(stdout);
free(uri_str);
return HSA_STATUS_SUCCESS;
}
hsa_status_t hsa_executable_freeze_interceptor(
hsa_executable_t executable,
const char *options)
{
hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char* options) {
HSA_RT(loader_api_table.hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
executable,
code_object_callback,
NULL));
HSA_RT(hsa_executable_freeze_fn(
executable,
options));
executable, code_object_callback, NULL));
HSA_RT(hsa_executable_freeze_fn(executable, options));
return HSA_STATUS_SUCCESS;
}
// HSA-runtime tool on-load method
extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table,
uint64_t runtime_version,
extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
uint64_t failed_tool_count,
const char* const* failed_tool_names)
{
printf("OnLoad: begin\n"); fflush(stdout);
const char* const* failed_tool_names) {
printf("OnLoad: begin\n");
fflush(stdout);
// intercepting hsa_executable_freeze API
hsa_executable_freeze_fn = table->core_->hsa_executable_freeze_fn;
table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
// Fetching AMD Loader HSA extension API
HSA_RT(hsa_system_get_major_extension_table(
HSA_EXTENSION_AMD_LOADER,
1,
sizeof(hsa_ven_amd_loader_1_01_pfn_t),
&loader_api_table));
printf("OnLoad: end\n"); fflush(stdout);
HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_ven_amd_loader_1_01_pfn_t), &loader_api_table));
printf("OnLoad: end\n");
fflush(stdout);
return true;
}
extern "C" PUBLIC_API void OnUnload() {
printf("OnUnload\n"); fflush(stdout);
printf("OnUnload\n");
fflush(stdout);
}
extern "C" CONSTRUCTOR_API void constructor() {
printf("constructor\n"); fflush(stdout);
printf("constructor\n");
fflush(stdout);
}
extern "C" DESTRUCTOR_API void destructor() {
printf("destructor\n"); fflush(stdout);
printf("destructor\n");
fflush(stdout);
}
+99 -57
Просмотреть файл
@@ -67,13 +67,15 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
if (HSA_AMD_SEGMENT_GLOBAL != segment) {
return HSA_STATUS_SUCCESS;
}
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
@@ -126,19 +128,22 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize
#ifdef ROCP_LD_AQLPROFILE
status = LoadAqlProfileLib(&aqlprofile_api_);
#else
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_);
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE,
hsa_ven_amd_aqlprofile_VERSION_MAJOR,
sizeof(aqlprofile_api_), &aqlprofile_api_);
#endif
CHECK_STATUS("aqlprofile API table load failed", status);
// Get Loader API table
loader_api_ = {0};
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_);
status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1,
sizeof(loader_api_), &loader_api_);
CHECK_STATUS("loader API table query failed", status);
// Instantiate HSA timer
timer_ = new HsaTimer(&hsa_api_);
CHECK_STATUS("HSA timer allocation failed",
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS);
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS);
// Time correlation
const uint32_t corr_iters = 1000;
@@ -146,7 +151,8 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize
CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters);
// System timeout
timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_);
timeout_ =
(timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_);
}
// Destructor of the class
@@ -172,9 +178,12 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn;
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_queue_load_write_index_relaxed =
table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed =
table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed =
table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
@@ -183,27 +192,34 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn;
hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn;
hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn;
hsa_api_.hsa_code_object_reader_create_from_file =
table->core_->hsa_code_object_reader_create_from_file_fn;
hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn;
hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn;
hsa_api_.hsa_executable_load_agent_code_object =
table->core_->hsa_executable_load_agent_code_object_fn;
hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn;
hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn;
hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn;
hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn;
hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn;
hsa_api_.hsa_system_get_major_extension_table =
table->core_->hsa_system_get_major_extension_table_fn;
hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
hsa_api_.hsa_amd_agent_iterate_memory_pools =
table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn;
hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn;
hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn;
hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn;
hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn;
hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn;
hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn;
hsa_api_.hsa_amd_profiling_set_profiler_enabled =
table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
hsa_api_.hsa_amd_profiling_get_async_copy_time =
table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn;
hsa_api_.hsa_amd_profiling_get_dispatch_time =
table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn;
} else {
hsa_api_.hsa_init = hsa_init;
hsa_api_.hsa_shut_down = hsa_shut_down;
@@ -298,10 +314,13 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
agent_info->dev_index = cpu_list_.size();
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
status =
hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool;
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool);
if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool;
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool,
&agent_info->kern_arg_pool);
if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL))
kern_arg_pool_ = &agent_info->kern_arg_pool;
agent_info->gpu_pool = {};
cpu_list_.push_back(agent_info);
@@ -319,21 +338,26 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
&agent_info->cu_num);
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
&agent_info->waves_per_cu);
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
&agent_info->simds_per_cu);
hsa_api_.hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
&agent_info->se_num);
hsa_api_.hsa_agent_get_info(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
&agent_info->cu_num);
hsa_api_.hsa_agent_get_info(agent,
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
&agent_info->shader_arrays_per_se);
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
&agent_info->waves_per_cu);
hsa_api_.hsa_agent_get_info(agent,
static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
&agent_info->simds_per_cu);
hsa_api_.hsa_agent_get_info(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
&agent_info->se_num);
hsa_api_.hsa_agent_get_info(
agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
&agent_info->shader_arrays_per_se);
agent_info->cpu_pool = {};
agent_info->kern_arg_pool = {};
status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
status =
hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);
// GFX8 and GFX9 SGPR/VGPR block sizes
@@ -430,7 +454,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
hsa_queue_t** queue) {
hsa_status_t status;
status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
UINT32_MAX, UINT32_MAX, queue);
UINT32_MAX, UINT32_MAX, queue);
return (status == HSA_STATUS_SUCCESS);
}
@@ -453,7 +477,8 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t
hsa_status_t status = HSA_STATUS_ERROR;
uint8_t* buffer = NULL;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast<void**>(&buffer));
status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0,
reinterpret_cast<void**>(&buffer));
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
return ptr;
}
@@ -468,7 +493,8 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size
uint8_t* buffer = NULL;
if (!cpu_agents_.empty()) {
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast<void**>(&buffer));
status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0,
reinterpret_cast<void**>(&buffer));
// Both the CPU and GPU can access the kernel arguments
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {agent_info->dev_id};
@@ -488,7 +514,8 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s
uint8_t* buffer = NULL;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
if (!cpu_agents_.empty()) {
status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast<void**>(&buffer));
status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0,
reinterpret_cast<void**>(&buffer));
// Both the CPU and GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t ag_list[1] = {agent_info->dev_id};
@@ -513,16 +540,18 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s
}
// Wait signal
hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const {
hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const {
const hsa_signal_value_t exp_value = signal_value - 1;
hsa_signal_value_t ret_value = signal_value;
while (1) {
ret_value =
hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED);
ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value,
timeout_, HSA_WAIT_STATE_BLOCKED);
if (ret_value == exp_value) break;
if (ret_value != signal_value) {
std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value
<< "), ret_value(" << ret_value << ")" << std::endl << std::flush;
<< "), ret_value(" << ret_value << ")" << std::endl
<< std::flush;
abort();
}
}
@@ -530,7 +559,8 @@ hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const
}
// Wait signal with signal value restore
void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const {
void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const {
SignalWait(signal, signal_value);
hsa_api_.hsa_signal_store_relaxed(const_cast<hsa_signal_t&>(signal), signal_value);
}
@@ -594,13 +624,13 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br
}
// Create executable.
status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
NULL, executable);
status = hsa_api_.hsa_executable_create_alt(
HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
CHECK_STATUS("Error in creating executable object", status);
// Load code object.
status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr,
NULL, NULL);
status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
code_obj_rdr, NULL, NULL);
CHECK_STATUS("Error in loading executable object", status);
// Freeze executable.
@@ -610,7 +640,7 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br
// Get symbol handle.
hsa_executable_symbol_t kernelSymbol;
status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
&kernelSymbol);
&kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
// Update output parameter
@@ -654,7 +684,8 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
}
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
uint32_t* queue_slot =
reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
// Copy buffered commands into the queue slot.
@@ -704,18 +735,22 @@ void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) {
table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
}
hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) {
hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec,
hsa_executable_symbol_t symbol, void* data) {
hsa_symbol_kind_t value = (hsa_symbol_kind_t)0;
hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value);
hsa_status_t status =
hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value);
CHECK_STATUS("Error in getting symbol info", status);
if (value == HSA_SYMBOL_KIND_KERNEL) {
uint64_t addr = 0;
uint32_t len = 0;
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr);
status = hsa_api_.hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr);
CHECK_STATUS("Error in getting kernel object", status);
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len);
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
&len);
CHECK_STATUS("Error in getting name len", status);
char *name = new char[len + 1];
char* name = new char[len + 1];
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
CHECK_STATUS("Error in getting kernel name", status);
name[len] = 0;
@@ -728,12 +763,15 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_ex
return HSA_STATUS_SUCCESS;
}
hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) {
hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable,
const char* options) {
std::lock_guard<mutex_t> lck(mutex_);
if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t;
hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
hsa_status_t status =
hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
CHECK_STATUS("Error in iterating executable symbols", status);
return hsa_api_.hsa_executable_freeze(executable, options);;
return hsa_api_.hsa_executable_freeze(executable, options);
;
}
void HsaRsrcFactory::DumpHandles(FILE* file) {
@@ -741,10 +779,14 @@ void HsaRsrcFactory::DumpHandles(FILE* file) {
auto end = agent_map_.end();
for (auto it = beg; it != end; ++it) {
const AgentInfo* agent_info = it->second;
fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle, (agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu");
if (agent_info->cpu_pool.handle != 0) fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle);
if (agent_info->kern_arg_pool.handle != 0) fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle);
if (agent_info->gpu_pool.handle != 0) fprintf(file, "0x%lx pool gpu\n", agent_info->gpu_pool.handle);
fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle,
(agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu");
if (agent_info->cpu_pool.handle != 0)
fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle);
if (agent_info->kern_arg_pool.handle != 0)
fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle);
if (agent_info->gpu_pool.handle != 0)
fprintf(file, "0x%lx pool gpu\n", agent_info->gpu_pool.handle);
}
fflush(file);
}
+34 -30
Просмотреть файл
@@ -44,23 +44,25 @@
#define HSA_QUEUE_ALIGN_BYTES 64
#define HSA_PACKET_ALIGN_BYTES 64
#define CHECK_STATUS(msg, status) do { \
if ((status) != HSA_STATUS_SUCCESS) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
#define CHECK_STATUS(msg, status) \
do { \
if ((status) != HSA_STATUS_SUCCESS) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
#define CHECK_ITER_STATUS(msg, status) do { \
if ((status) != HSA_STATUS_INFO_BREAK) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
#define CHECK_ITER_STATUS(msg, status) \
do { \
if ((status) != HSA_STATUS_INFO_BREAK) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
abort(); \
} \
} while (0)
static const size_t MEM_PAGE_BYTES = 0x1000;
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
@@ -172,15 +174,12 @@ class HsaTimer {
static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
typedef long double freq_t;
enum time_id_t {
TIME_ID_CLOCK_REALTIME = 0,
TIME_ID_CLOCK_MONOTONIC = 1,
TIME_ID_NUMBER
};
enum time_id_t { TIME_ID_CLOCK_REALTIME = 0, TIME_ID_CLOCK_MONOTONIC = 1, TIME_ID_NUMBER };
HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) {
timestamp_t sysclock_hz = 0;
hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
hsa_status_t status =
hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status);
sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz;
}
@@ -215,8 +214,8 @@ class HsaTimer {
// Return pair of correlated values of profiling timestamp and time with
// correlation error for a given time ID and number of iterations
void correlated_pair_ns(time_id_t time_id, uint32_t iters,
timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) const {
void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v,
timestamp_t* time_v, timestamp_t* error_v) const {
clockid_t clock_id = 0;
switch (clock_id) {
case TIME_ID_CLOCK_REALTIME:
@@ -355,7 +354,8 @@ class HsaRsrcFactory {
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);
// Wait signal
hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const;
hsa_signal_value_t SignalWait(const hsa_signal_t& signal,
const hsa_signal_value_t& signal_value) const;
// Wait signal with signal value restore
void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const;
@@ -401,7 +401,9 @@ class HsaRsrcFactory {
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
// Methods for system-clock/ns conversion and timestamp in 'ns'
timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); }
timestamp_t SysclockToNs(const timestamp_t& sysclock) const {
return timer_->sysclock_to_ns(sysclock);
}
timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); }
timestamp_t TimestampNs() const { return timer_->timestamp_ns(); }
@@ -480,8 +482,10 @@ class HsaRsrcFactory {
typedef std::map<uint64_t, const char*> symbols_map_t;
static symbols_map_t* symbols_map_;
static bool executable_tracking_on_;
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options);
static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data);
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable,
const char* options);
static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol,
void* data);
// HSA runtime API table
static hsa_pfn_t hsa_api_;
@@ -505,8 +509,8 @@ class HsaRsrcFactory {
timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER];
// CPU/kern-arg memory pools
hsa_amd_memory_pool_t *cpu_pool_;
hsa_amd_memory_pool_t *kern_arg_pool_;
hsa_amd_memory_pool_t* cpu_pool_;
hsa_amd_memory_pool_t* kern_arg_pool_;
};
#endif // _HSA_RSRC_FACTORY_H_
+3 -1
Просмотреть файл
@@ -24,7 +24,9 @@
#include "ctrl/test_hsa.h"
#include "util/test_assert.h"
template <class Kernel, class Test> bool RunKernel(int argc = 0, char* argv[] = NULL, const AgentInfo* agent_info = NULL, hsa_queue_t* queue = NULL, int count = 1) {
template <class Kernel, class Test>
bool RunKernel(int argc = 0, char* argv[] = NULL, const AgentInfo* agent_info = NULL,
hsa_queue_t* queue = NULL, int count = 1) {
bool ret_val = false;
if (getenv("ROC_TEST_TRACE") == NULL) std::clog.rdbuf(NULL);
+6 -4
Просмотреть файл
@@ -63,7 +63,8 @@ bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) {
return false;
}
}
std::clog << "> Using agent[" << agent_info_->dev_index << "] : " << agent_info_->name << std::endl;
std::clog << "> Using agent[" << agent_info_->dev_index << "] : " << agent_info_->name
<< std::endl;
// Create an instance of Aql Queue
if (hsa_queue_ == NULL) {
@@ -116,8 +117,8 @@ bool TestHsa::Setup() {
size_t size_info = 0;
const hsa_status_t status = hsa_executable_symbol_get_info(
kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info);
TEST_ASSERT(status == HSA_STATUS_SUCCESS);
size_info = kernarg_size;
TEST_ASSERT(status == HSA_STATUS_SUCCESS);
size_info = kernarg_size;
const bool kernarg_missmatch = (kernarg_size > size_info);
if (kernarg_missmatch) {
std::cout << "kernarg_size = " << kernarg_size << ", size_info = " << size_info
@@ -209,7 +210,8 @@ bool TestHsa::Run() {
// Submit AQL packet to the queue
const uint64_t que_idx = hsa_rsrc_->Submit(hsa_queue_, &aql);
std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl << std::flush;
std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl
<< std::flush;
// Wait on the dispatch signal until the kernel is finished.
// Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling
+4 -5
Просмотреть файл
@@ -33,10 +33,7 @@ class DummyKernel : public TestKernel {
enum { KERNARG_BUF_ID, LOCAL_BUF_ID };
// Constructor
DummyKernel() :
width_(64),
height_(64)
{
DummyKernel() : width_(64), height_(64) {
SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, 0);
SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, 0);
}
@@ -57,7 +54,9 @@ class DummyKernel : public TestKernel {
// Reference CPU implementation
bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask,
const uint32_t width, const uint32_t height,
const uint32_t maskWidth, const uint32_t maskHeight) { return true; }
const uint32_t maskWidth, const uint32_t maskHeight) {
return true;
}
// Width of the Input array
const uint32_t width_;
+13 -16
Просмотреть файл
@@ -28,9 +28,8 @@
#include <sstream>
#include <utility>
template <class evt_id_t, class evt_weight_t>
class EvtStatsT {
public:
template <class evt_id_t, class evt_weight_t> class EvtStatsT {
public:
typedef std::mutex mutex_t;
typedef uint64_t evt_count_t;
typedef double evt_avr_t;
@@ -51,7 +50,7 @@ class EvtStatsT {
inline void add_event(evt_id_t id, evt_weight_t weight) {
std::lock_guard<mutex_t> lck(mutex_);
//printf("EvtStats %p ::add_event %u %lu\n", this, id, weight); fflush(stdout);
// printf("EvtStats %p ::add_event %u %lu\n", this, id, weight); fflush(stdout);
evt_record_t& rec = map_[id];
const evt_count_t prev_count = rec.count;
@@ -65,7 +64,8 @@ class EvtStatsT {
void dump() {
std::lock_guard<mutex_t> lck(mutex_);
fprintf(stdout, "Dumping %s\n", path_); fflush(stdout);
fprintf(stdout, "Dumping %s\n", path_);
fflush(stdout);
typedef typename std::set<std::pair<evt_id_t, evt_record_t>, cmpfun> set_t;
set_t s_(map_.begin(), map_.end());
@@ -75,7 +75,8 @@ class EvtStatsT {
const evt_id_t id = e.first;
const char* label = get_label(id);
std::ostringstream oss;
oss << index << ",\"" << label << "\"," << e.second.count << "," << (uint64_t)(e.second.avr) << "," << (uint64_t)(e.second.count * e.second.avr);
oss << index << ",\"" << label << "\"," << e.second.count << "," << (uint64_t)(e.second.avr)
<< "," << (uint64_t)(e.second.count * e.second.avr);
fprintf(fdes_, "%s\n", oss.str().c_str());
index += 1;
}
@@ -88,24 +89,20 @@ class EvtStatsT {
const char* label = ret.first->second;
return label;
}
const char* get_label(const char* id) {
return id;
}
const char* get_label(const std::string& id) {
return id.c_str();
}
const char* get_label(const char* id) { return id; }
const char* get_label(const std::string& id) { return id.c_str(); }
void set_label(evt_id_t id, const char* label) {
//printf("EvtStats %p ::set_label %u %s\n", this, id, label); fflush(stdout);
// printf("EvtStats %p ::set_label %u %s\n", this, id, label); fflush(stdout);
labels_[id] = label;
}
EvtStatsT(FILE* f, const char* path) : fdes_(f), path_(path) {
//printf("EvtStats %p ::EvtStatsT()\n", this); fflush(stdout);
// printf("EvtStats %p ::EvtStatsT()\n", this); fflush(stdout);
fprintf(fdes_, "Index,Name,Count,Avr,Total\n");
}
private:
private:
mutex_t mutex_;
map_t map_;
labels_t labels_;
@@ -115,4 +112,4 @@ class EvtStatsT {
typedef EvtStatsT<uint32_t, uint64_t> EvtStats;
#endif // EVT_STATS_H_
#endif // EVT_STATS_H_
+4 -2
Просмотреть файл
@@ -212,7 +212,8 @@ class Xml {
buf[size - 1] = '\0';
if (strncmp(buf, "#include \"", 10) == 0) {
for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {}
for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {
}
if (ind == size) {
fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize);
error = true;
@@ -222,7 +223,8 @@ class Xml {
size = ind;
lseek(fd_, pos + ind + 1, SEEK_SET);
for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {}
for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {
}
if (ind == size) {
error = true;
break;
+211 -191
Просмотреть файл
@@ -21,15 +21,15 @@
#include <sstream>
#include <string>
#include <cxxabi.h> /* names denangle */
#include <cxxabi.h> /* names denangle */
#include <dirent.h>
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h> /* SYS_xxx definitions */
#include <sys/syscall.h> /* SYS_xxx definitions */
#include <sys/types.h>
#include <unistd.h> /* usleep */
#include <unistd.h> /* usleep */
#include <roctracer_ext.h>
#include "src/util/exception.h"
@@ -70,10 +70,12 @@
} \
} while (0)
#define ONLOAD_TRACE(str) \
if (getenv("ROCP_ONLOAD_TRACE")) do { \
std::cout << "PID(" << GetPid() << "): TRACER_TOOL::" << __FUNCTION__ << " " << str << std::endl << std::flush; \
} while(0);
#define ONLOAD_TRACE(str) \
if (getenv("ROCP_ONLOAD_TRACE")) do { \
std::cout << "PID(" << GetPid() << "): TRACER_TOOL::" << __FUNCTION__ << " " << str \
<< std::endl \
<< std::flush; \
} while (0);
#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin")
#define ONLOAD_TRACE_END() ONLOAD_TRACE("end")
@@ -88,7 +90,8 @@ inline static void DEBUG_TRACE(const char* fmt, ...) {
va_list valist;
va_start(valist, fmt);
vsnprintf(buf, size, fmt, valist);
printf("%u:%u %s", GetPid(), GetTid(), buf); fflush(stdout);
printf("%u:%u %s", GetPid(), GetTid(), buf);
fflush(stdout);
va_end(valist);
}
#else
@@ -107,18 +110,18 @@ bool trace_hip_activity = false;
bool trace_pcs = false;
// The below getter functions have been written intentionally to fix an issue
// with constructor ordering. Previously when hip_api_vec and hsa_api_vec
// with constructor ordering. Previously when hip_api_vec and hsa_api_vec
// were left as simple global variables, whenever the tool_load() function
// was called from "extern "C" CONSTRUCTOR_API void constructor()" of libtracer_tool.so
// the ordering of std::vector constructor becomes undefined. This meant that you could assign
// hip_api_vec and hsa_api_vec with a value in tool_load() and once the function returns, the std::vector
// default constructor would execute later, causing the values to be lost.
// hip_api_vec and hsa_api_vec with a value in tool_load() and once the function returns, the
// std::vector default constructor would execute later, causing the values to be lost.
static std::vector<std::string> &hsa_api_vec() {
static std::vector<std::string>& hsa_api_vec() {
static std::vector<std::string> hsa_api_vec;
return hsa_api_vec;
}
static std::vector<std::string> &hip_api_vec() {
static std::vector<std::string>& hip_api_vec() {
static std::vector<std::string> hip_api_vec;
return hip_api_vec;
}
@@ -167,7 +170,8 @@ void fatal(const std::string msg) {
static inline const char* cxx_demangle(const char* symbol) {
size_t funcnamesize;
int status;
const char* ret = (symbol != NULL) ? abi::__cxa_demangle(symbol, NULL, &funcnamesize, &status) : symbol;
const char* ret =
(symbol != NULL) ? abi::__cxa_demangle(symbol, NULL, &funcnamesize, &status) : symbol;
return (ret != NULL) ? ret : strdup(symbol);
}
@@ -208,7 +212,8 @@ void* control_thr_fun(void*) {
uint32_t control_flush_us = 0;
pthread_t flush_thread;
bool flush_thread_started = false;
std::mutex flush_thread_mutex;;
std::mutex flush_thread_mutex;
;
void* flush_thr_fun(void*) {
const uint32_t dist_sec = control_flush_us / 1000000;
@@ -218,7 +223,8 @@ void* flush_thr_fun(void*) {
sleep(dist_sec);
usleep(dist_us);
std::lock_guard<std::mutex> lock(flush_thread_mutex);
if (!flush_thread_started) while(1) sleep(1);
if (!flush_thread_started)
while (1) sleep(1);
ROCTRACER_CALL(roctracer_flush_activity());
roctracer::TraceBufferBase::FlushAll();
}
@@ -241,17 +247,13 @@ struct roctx_trace_entry_t {
};
void roctx_flush_cb(roctx_trace_entry_t* entry);
constexpr roctracer::TraceBuffer<roctx_trace_entry_t>::flush_prm_t roctx_flush_prm = {roctracer::DFLT_ENTRY_TYPE, roctx_flush_cb};
constexpr roctracer::TraceBuffer<roctx_trace_entry_t>::flush_prm_t roctx_flush_prm = {
roctracer::DFLT_ENTRY_TYPE, roctx_flush_cb};
roctracer::TraceBuffer<roctx_trace_entry_t>* roctx_trace_buffer = NULL;
// rocTX callback function
static inline void roctx_callback_fun(
uint32_t domain,
uint32_t cid,
uint32_t tid,
roctx_range_id_t rid,
const char* message)
{
static inline void roctx_callback_fun(uint32_t domain, uint32_t cid, uint32_t tid,
roctx_range_id_t rid, const char* message) {
#if ROCTX_CLOCK_TIME
const timestamp_t time = HsaTimer::clocktime_ns(HsaTimer::TIME_ID_CLOCK_MONOTONIC);
#else
@@ -267,12 +269,7 @@ static inline void roctx_callback_fun(
entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release);
}
void roctx_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
void roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) {
(void)arg;
const roctx_api_data_t* data = reinterpret_cast<const roctx_api_data_t*>(callback_data);
roctx_callback_fun(domain, cid, GetTid(), data->args.id, data->args.message);
@@ -280,27 +277,37 @@ void roctx_api_callback(
// rocTX Start/Stop callbacks
void roctx_range_start_callback(const roctx_range_data_t* data, void* arg) {
roctx_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangePushA, data->tid, 0, data->message);
roctx_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangePushA, data->tid, 0,
data->message);
}
void roctx_range_stop_callback(const roctx_range_data_t* data, void* arg) {
roctx_callback_fun(ACTIVITY_DOMAIN_ROCTX, ROCTX_API_ID_roctxRangePop, data->tid, 0, NULL);
}
void start_callback() { roctracer::RocTxLoader::Instance().RangeStackIterate(roctx_range_start_callback, NULL); }
void stop_callback() { roctracer::RocTxLoader::Instance().RangeStackIterate(roctx_range_stop_callback, NULL); }
void start_callback() {
roctracer::RocTxLoader::Instance().RangeStackIterate(roctx_range_start_callback, NULL);
}
void stop_callback() {
roctracer::RocTxLoader::Instance().RangeStackIterate(roctx_range_stop_callback, NULL);
}
// rocTX buffer flush function
void roctx_flush_cb(roctx_trace_entry_t* entry) {
#if ROCTX_CLOCK_TIME
timestamp_t timestamp = 0;
HsaRsrcFactory::Instance().GetTimestamp(HsaTimer::TIME_ID_CLOCK_MONOTONIC, entry->time, &timestamp);
HsaRsrcFactory::Instance().GetTimestamp(HsaTimer::TIME_ID_CLOCK_MONOTONIC, entry->time,
&timestamp);
#else
const timestamp_t timestamp = entry->time;
#endif
std::ostringstream os;
os << timestamp << " " << entry->pid << ":" << entry->tid << " " << entry->cid << ":" << entry->rid;
if (entry->message != NULL) os << ":\"" << entry->message << "\"";
else os << ":\"\"";
fprintf(roctx_file_handle, "%s\n", os.str().c_str()); fflush(roctx_file_handle);
os << timestamp << " " << entry->pid << ":" << entry->tid << " " << entry->cid << ":"
<< entry->rid;
if (entry->message != NULL)
os << ":\"" << entry->message << "\"";
else
os << ":\"\"";
fprintf(roctx_file_handle, "%s\n", os.str().c_str());
fflush(roctx_file_handle);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -318,24 +325,20 @@ struct hsa_api_trace_entry_t {
};
void hsa_api_flush_cb(hsa_api_trace_entry_t* entry);
constexpr roctracer::TraceBuffer<hsa_api_trace_entry_t>::flush_prm_t hsa_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hsa_api_flush_cb};
constexpr roctracer::TraceBuffer<hsa_api_trace_entry_t>::flush_prm_t hsa_flush_prm = {
roctracer::DFLT_ENTRY_TYPE, hsa_api_flush_cb};
roctracer::TraceBuffer<hsa_api_trace_entry_t>* hsa_api_trace_buffer = NULL;
// HSA API callback function
void hsa_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
void hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) {
(void)arg;
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
hsa_begin_timestamp = timer->timestamp_fn_ns();
} else {
const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : timer->timestamp_fn_ns();
const timestamp_t end_timestamp =
(cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : timer->timestamp_fn_ns();
hsa_api_trace_entry_t* entry = hsa_api_trace_buffer->GetEntry();
entry->cid = cid;
entry->begin = hsa_begin_timestamp;
@@ -349,17 +352,17 @@ void hsa_api_callback(
void hsa_api_flush_cb(hsa_api_trace_entry_t* entry) {
std::ostringstream os;
os << entry->begin << ":" << entry->end << " " << entry->pid << ":" << entry->tid << " " << hsa_api_data_pair_t(entry->cid, entry->data);
fprintf(hsa_api_file_handle, "%s\n", os.str().c_str()); fflush(hsa_api_file_handle);
os << entry->begin << ":" << entry->end << " " << entry->pid << ":" << entry->tid << " "
<< hsa_api_data_pair_t(entry->cid, entry->data);
fprintf(hsa_api_file_handle, "%s\n", os.str().c_str());
fflush(hsa_api_file_handle);
}
void hsa_activity_callback(
uint32_t op,
activity_record_t* record,
void* arg)
{
void hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg) {
static uint64_t index = 0;
fprintf(hsa_async_copy_file_handle, "%lu:%lu async-copy:%lu:%u\n", record->begin_ns, record->end_ns, index, my_pid); fflush(hsa_async_copy_file_handle);
fprintf(hsa_async_copy_file_handle, "%lu:%lu async-copy:%lu:%u\n", record->begin_ns,
record->end_ns, index, my_pid);
fflush(hsa_async_copy_file_handle);
index++;
}
@@ -381,28 +384,21 @@ struct hip_api_trace_entry_t {
};
void hip_api_flush_cb(hip_api_trace_entry_t* entry);
constexpr roctracer::TraceBuffer<hip_api_trace_entry_t>::flush_prm_t hip_api_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_api_flush_cb};
constexpr roctracer::TraceBuffer<hip_api_trace_entry_t>::flush_prm_t hip_api_flush_prm = {
roctracer::DFLT_ENTRY_TYPE, hip_api_flush_cb};
roctracer::TraceBuffer<hip_api_trace_entry_t>* hip_api_trace_buffer = NULL;
static inline bool is_hip_kernel_launch_api(const uint32_t& cid) {
bool ret =
(cid == HIP_API_ID_hipLaunchKernel) ||
(cid == HIP_API_ID_hipExtLaunchKernel) ||
(cid == HIP_API_ID_hipLaunchCooperativeKernel) ||
(cid == HIP_API_ID_hipLaunchCooperativeKernelMultiDevice) ||
(cid == HIP_API_ID_hipExtLaunchMultiKernelMultiDevice) ||
(cid == HIP_API_ID_hipModuleLaunchKernel) ||
(cid == HIP_API_ID_hipExtModuleLaunchKernel) ||
(cid == HIP_API_ID_hipHccModuleLaunchKernel);
bool ret = (cid == HIP_API_ID_hipLaunchKernel) || (cid == HIP_API_ID_hipExtLaunchKernel) ||
(cid == HIP_API_ID_hipLaunchCooperativeKernel) ||
(cid == HIP_API_ID_hipLaunchCooperativeKernelMultiDevice) ||
(cid == HIP_API_ID_hipExtLaunchMultiKernelMultiDevice) ||
(cid == HIP_API_ID_hipModuleLaunchKernel) || (cid == HIP_API_ID_hipExtModuleLaunchKernel) ||
(cid == HIP_API_ID_hipHccModuleLaunchKernel);
return ret;
}
void hip_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
void hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) {
(void)arg;
const hip_api_data_t* data = reinterpret_cast<const hip_api_data_t*>(callback_data);
const timestamp_t timestamp = timer->timestamp_fn_ns();
@@ -428,16 +424,17 @@ void hip_api_callback(
if (cid == HIP_API_ID_hipMalloc) {
entry->ptr = *(data->args.hipMalloc.ptr);
} else if (is_hip_kernel_launch_api(cid)) {
switch(cid) {
switch (cid) {
case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice:
case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice:
{
const hipLaunchParams* listKernels = data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList;
case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: {
const hipLaunchParams* listKernels =
data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList;
std::string name_str = "";
for (int i = 0; i < data->args.hipLaunchCooperativeKernelMultiDevice.numDevices; ++i) {
const hipLaunchParams& lp = listKernels[i];
if (lp.func != NULL) {
const char* kernel_name = roctracer::HipLoader::Instance().KernelNameRefByPtr(lp.func, lp.stream);
const char* kernel_name =
roctracer::HipLoader::Instance().KernelNameRefByPtr(lp.func, lp.stream);
const int device_id = roctracer::HipLoader::Instance().GetStreamDeviceId(lp.stream);
name_str += std::string(kernel_name) + ":" + std::to_string(device_id) + ";";
}
@@ -446,22 +443,21 @@ void hip_api_callback(
break;
}
case HIP_API_ID_hipLaunchKernel:
case HIP_API_ID_hipLaunchCooperativeKernel:
{
case HIP_API_ID_hipLaunchCooperativeKernel: {
const void* f = data->args.hipLaunchKernel.function_address;
hipStream_t stream = data->args.hipLaunchKernel.stream;
if (f != NULL) entry->name = strdup(roctracer::HipLoader::Instance().KernelNameRefByPtr(f, stream));
if (f != NULL)
entry->name = strdup(roctracer::HipLoader::Instance().KernelNameRefByPtr(f, stream));
break;
}
case HIP_API_ID_hipExtLaunchKernel:
{
case HIP_API_ID_hipExtLaunchKernel: {
const void* f = data->args.hipExtLaunchKernel.function_address;
hipStream_t stream = data->args.hipExtLaunchKernel.stream;
if (f != NULL) entry->name = strdup(roctracer::HipLoader::Instance().KernelNameRefByPtr(f, stream));
if (f != NULL)
entry->name = strdup(roctracer::HipLoader::Instance().KernelNameRefByPtr(f, stream));
break;
}
default:
{
default: {
const hipFunction_t f = data->args.hipModuleLaunchKernel.f;
if (f != NULL) entry->name = strdup(roctracer::HipLoader::Instance().KernelNameRef(f));
}
@@ -471,17 +467,15 @@ void hip_api_callback(
entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release);
}
const char * name = roctracer_op_string(domain, cid, 0);
DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu) timestamp(%lu)\n",
name, data->phase, cid, data, entry, (entry) ? entry->name : NULL, data->correlation_id, timestamp);
const char* name = roctracer_op_string(domain, cid, 0);
DEBUG_TRACE(
"hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") "
"correlation_id(%lu) timestamp(%lu)\n",
name, data->phase, cid, data, entry, (entry) ? entry->name : NULL, data->correlation_id,
timestamp);
}
void mark_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
void mark_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) {
(void)arg;
const char* name = reinterpret_cast<const char*>(callback_data);
@@ -513,22 +507,27 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) {
std::ostringstream rec_ss;
std::ostringstream oss;
const char* str = (domain != ACTIVITY_DOMAIN_EXT_API) ? roctracer_op_string(domain, cid, 0) : strdup("MARK");
rec_ss << std::dec << begin_timestamp << ":" << end_timestamp << " " << entry->pid << ":" << entry->tid;
const char* str =
(domain != ACTIVITY_DOMAIN_EXT_API) ? roctracer_op_string(domain, cid, 0) : strdup("MARK");
rec_ss << std::dec << begin_timestamp << ":" << end_timestamp << " " << entry->pid << ":"
<< entry->tid;
oss << std::dec << rec_ss.str() << " " << str;
const char * name = roctracer_op_string(entry->domain, entry->cid, 0);
DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu) beg(%lu) end(%lu))\n",
name, entry->domain, entry->cid, entry, entry->name, correlation_id, begin_timestamp, end_timestamp);
const char* name = roctracer_op_string(entry->domain, entry->cid, 0);
DEBUG_TRACE(
"hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu) "
"beg(%lu) end(%lu))\n",
name, entry->domain, entry->cid, entry, entry->name, correlation_id, begin_timestamp,
end_timestamp);
if (domain == ACTIVITY_DOMAIN_HIP_API) {
#if HIP_PROF_HIP_API_STRING
if (hip_api_stats != NULL) {
hip_api_stats->add_event(cid, end_timestamp - begin_timestamp);
if (is_hip_kernel_launch_api(cid)) {
hip_kernel_mutex.lock();
hip_kernel_mutex.lock();
(*hip_kernel_map)[correlation_id] = entry->name;
hip_kernel_mutex.unlock();
hip_kernel_mutex.unlock();
}
} else {
const char* str = hipApiString((hip_api_id_t)cid, data);
@@ -537,50 +536,36 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) {
const char* kernel_name = cxx_demangle(entry->name);
rec_ss << " kernel=" << kernel_name;
}
rec_ss<< " :" << correlation_id;
rec_ss << " :" << correlation_id;
fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str());
}
#else // !HIP_PROF_HIP_API_STRING
#else // !HIP_PROF_HIP_API_STRING
switch (cid) {
case HIP_API_ID_hipMemcpy:
fprintf(hip_api_file_handle, "%s(dst(%p) src(%p) size(0x%x) kind(%u))\n",
oss.str().c_str(),
data->args.hipMemcpy.dst,
data->args.hipMemcpy.src,
(uint32_t)(data->args.hipMemcpy.sizeBytes),
(uint32_t)(data->args.hipMemcpy.kind));
fprintf(hip_api_file_handle, "%s(dst(%p) src(%p) size(0x%x) kind(%u))\n", oss.str().c_str(),
data->args.hipMemcpy.dst, data->args.hipMemcpy.src,
(uint32_t)(data->args.hipMemcpy.sizeBytes), (uint32_t)(data->args.hipMemcpy.kind));
break;
case HIP_API_ID_hipMemcpyAsync:
fprintf(hip_api_file_handle, "%s(dst(%p) src(%p) size(0x%x) kind(%u) stream(%p))\n",
oss.str().c_str(),
data->args.hipMemcpyAsync.dst,
data->args.hipMemcpyAsync.src,
(uint32_t)(data->args.hipMemcpyAsync.sizeBytes),
(uint32_t)(data->args.hipMemcpyAsync.kind),
data->args.hipMemcpyAsync.stream);
oss.str().c_str(), data->args.hipMemcpyAsync.dst, data->args.hipMemcpyAsync.src,
(uint32_t)(data->args.hipMemcpyAsync.sizeBytes),
(uint32_t)(data->args.hipMemcpyAsync.kind), data->args.hipMemcpyAsync.stream);
break;
case HIP_API_ID_hipMalloc:
fprintf(hip_api_file_handle, "%s(ptr(%p) size(0x%x))\n",
oss.str().c_str(),
entry->ptr,
(uint32_t)(data->args.hipMalloc.size));
fprintf(hip_api_file_handle, "%s(ptr(%p) size(0x%x))\n", oss.str().c_str(), entry->ptr,
(uint32_t)(data->args.hipMalloc.size));
break;
case HIP_API_ID_hipFree:
fprintf(hip_api_file_handle, "%s(ptr(%p))\n",
oss.str().c_str(),
data->args.hipFree.ptr);
fprintf(hip_api_file_handle, "%s(ptr(%p))\n", oss.str().c_str(), data->args.hipFree.ptr);
break;
case HIP_API_ID_hipModuleLaunchKernel:
fprintf(hip_api_file_handle, "%s(kernel(%s) stream(%p))\n",
oss.str().c_str(),
cxx_demangle(entry->name),
data->args.hipModuleLaunchKernel.stream);
fprintf(hip_api_file_handle, "%s(kernel(%s) stream(%p))\n", oss.str().c_str(),
cxx_demangle(entry->name), data->args.hipModuleLaunchKernel.stream);
break;
case HIP_API_ID_hipExtModuleLaunchKernel:
fprintf(hip_api_file_handle, "%s(kernel(%s) stream(%p))\n",
oss.str().c_str(),
cxx_demangle(entry->name),
data->args.hipExtModuleLaunchKernel.hStream);
fprintf(hip_api_file_handle, "%s(kernel(%s) stream(%p))\n", oss.str().c_str(),
cxx_demangle(entry->name), data->args.hipExtModuleLaunchKernel.hStream);
break;
default:
fprintf(hip_api_file_handle, "%s()\n", oss.str().c_str());
@@ -605,23 +590,26 @@ struct hip_act_trace_entry_t {
};
void hip_act_flush_cb(hip_act_trace_entry_t* entry);
constexpr roctracer::TraceBuffer<hip_act_trace_entry_t>::flush_prm_t hip_act_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_act_flush_cb};
constexpr roctracer::TraceBuffer<hip_act_trace_entry_t>::flush_prm_t hip_act_flush_prm = {
roctracer::DFLT_ENTRY_TYPE, hip_act_flush_cb};
roctracer::TraceBuffer<hip_act_trace_entry_t>* hip_act_trace_buffer = NULL;
// HIP ACT trace buffer flush callback
void hip_act_flush_cb(hip_act_trace_entry_t* entry) {
const uint32_t domain = ACTIVITY_DOMAIN_HCC_OPS;
const uint32_t op = 0;
const char * name = roctracer_op_string(domain, op, entry->kind);
const char* name = roctracer_op_string(domain, op, entry->kind);
if (name == NULL) {
printf("hip_act_flush_cb name is NULL\n"); fflush(stdout);
printf("hip_act_flush_cb name is NULL\n");
fflush(stdout);
abort();
}
if (strncmp("Kernel", name, 6) == 0) {
hip_kernel_mutex.lock();
if (hip_kernel_stats == NULL) {
printf("hip_act_flush_cb hip_kernel_stats is NULL\n"); fflush(stdout);
printf("hip_act_flush_cb hip_kernel_stats is NULL\n");
fflush(stdout);
abort();
}
name = (*hip_kernel_map)[entry->correlation_id];
@@ -640,11 +628,14 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) {
const roctracer_record_t* end_record = reinterpret_cast<const roctracer_record_t*>(end);
while (record < end_record) {
const char * name = roctracer_op_string(record->domain, record->op, record->kind);
DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu) beg(%lu) end(%lu)\n",
name, record->domain, record->op, record->kind, record, record->correlation_id, record->begin_ns, record->end_ns);
const char* name = roctracer_op_string(record->domain, record->op, record->kind);
DEBUG_TRACE(
"pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu) "
"beg(%lu) end(%lu)\n",
name, record->domain, record->op, record->kind, record, record->correlation_id,
record->begin_ns, record->end_ns);
switch(record->domain) {
switch (record->domain) {
case ACTIVITY_DOMAIN_HCC_OPS:
if (hip_memcpy_stats != NULL) {
hip_act_trace_entry_t* entry = hip_act_trace_buffer->GetEntry();
@@ -653,17 +644,16 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) {
entry->correlation_id = record->correlation_id;
entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release);
} else {
fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n",
record->begin_ns, record->end_ns,
record->device_id, record->queue_id,
name, record->correlation_id, my_pid);
fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", record->begin_ns,
record->end_ns, record->device_id, record->queue_id, name, record->correlation_id,
my_pid);
fflush(hcc_activity_file_handle);
}
break;
case ACTIVITY_DOMAIN_HSA_OPS:
if (record->op == HSA_OP_ID_RESERVED1) {
fprintf(pc_sample_file_handle, "%u %lu 0x%lx %s\n",
record->pc_sample.se, record->pc_sample.cycle, record->pc_sample.pc, name);
fprintf(pc_sample_file_handle, "%u %lu 0x%lx %s\n", record->pc_sample.se,
record->pc_sample.cycle, record->pc_sample.pc, name);
fflush(pc_sample_file_handle);
}
break;
@@ -682,11 +672,14 @@ std::string normalize_token(const std::string& token, bool not_empty, const std:
std::string error_str = "none";
if (first_pos != std::string::npos) {
const size_t last_pos = token.find_last_not_of(space_chars_set);
if (last_pos == std::string::npos) error_str = "token string error: \"" + token + "\"";
if (last_pos == std::string::npos)
error_str = "token string error: \"" + token + "\"";
else {
const size_t end_pos = last_pos + 1;
if (end_pos <= first_pos) error_str = "token string error: \"" + token + "\"";
else norm_len = end_pos - first_pos;
if (end_pos <= first_pos)
error_str = "token string error: \"" + token + "\"";
else
norm_len = end_pos - first_pos;
}
}
if (((first_pos != std::string::npos) && (norm_len == 0)) ||
@@ -696,7 +689,8 @@ std::string normalize_token(const std::string& token, bool not_empty, const std:
return (norm_len != 0) ? token.substr(first_pos, norm_len) : std::string("");
}
int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim, std::vector<std::string>* vec, const char* label = NULL) {
int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim,
std::vector<std::string>* vec, const char* label = NULL) {
int parse_iter = 0;
const auto& opts = node->opts;
auto it = opts.find(field);
@@ -706,7 +700,7 @@ int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const
size_t pos1 = 0;
const size_t string_len = array_string.length();
while (pos1 < string_len) {
// set pos2 such that it also handles case of multiple delimiter options.
// set pos2 such that it also handles case of multiple delimiter options.
// For example- "hipLaunchKernel, hipExtModuleLaunchKernel, hipMemsetAsync"
// in this example delimiters are ' ' and also ','
const size_t pos2 = array_string.find_first_of(delim, pos1);
@@ -716,9 +710,9 @@ int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const
const std::string norm_str = normalize_token(token, found, "get_xml_array");
if (norm_str.length() != 0) vec->push_back(norm_str);
if (!found) break;
// update pos2 such that it represents the first non-delimiter character
// in case multiple delimiters are specified in variable 'delim'
pos1 = array_string.find_first_not_of(delim, pos2);
// update pos2 such that it represents the first non-delimiter character
// in case multiple delimiters are specified in variable 'delim'
pos1 = array_string.find_first_not_of(delim, pos2);
++parse_iter;
}
}
@@ -742,7 +736,8 @@ FILE* open_output_file(const char* prefix, const char* name, const char** path =
}
if (path != NULL) *path = strdup(oss.str().c_str());
} else file_handle = stdout;
} else
file_handle = stdout;
return file_handle;
}
@@ -785,7 +780,7 @@ void tool_unload() {
flush_thread_started = false;
flush_thread_mutex.unlock();
PTHREAD_CALL(pthread_cancel(flush_thread));
void *res;
void* res;
PTHREAD_CALL(pthread_join(flush_thread, &res));
if (res != PTHREAD_CANCELED) FATAL("flush thread wasn't stopped correctly");
}
@@ -860,7 +855,8 @@ void tool_load() {
}
}
printf("ROCTracer (pid=%d): ", (int)GetPid()); fflush(stdout);
printf("ROCTracer (pid=%d): ", (int)GetPid());
fflush(stdout);
// XML input
const char* xml_name = getenv("ROCP_INPUT");
@@ -879,8 +875,10 @@ void tool_load() {
std::vector<std::string> api_vec;
for (const auto* node : entry->nodes) {
if (node->tag != "parameters") fatal("ROCTracer: trace node is not supported '" + name + ":" + node->tag + "'");
get_xml_array(node, "api", ", ", &api_vec); // delimiter options given as both spaces and commas (' ' and ',')
if (node->tag != "parameters")
fatal("ROCTracer: trace node is not supported '" + name + ":" + node->tag + "'");
get_xml_array(node, "api", ", ",
&api_vec); // delimiter options given as both spaces and commas (' ' and ',')
break;
}
@@ -917,14 +915,13 @@ void tool_load() {
roctx_file_handle = open_output_file(output_prefix, "roctx_trace.txt");
// initialize HSA tracing
roctracer_ext_properties_t properties {
start_callback,
stop_callback
};
roctracer_ext_properties_t properties{start_callback, stop_callback};
roctracer_set_properties(ACTIVITY_DOMAIN_EXT_API, &properties);
fprintf(stdout, " rocTX-trace()\n"); fflush(stdout);
ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, roctx_api_callback, NULL));
fprintf(stdout, " rocTX-trace()\n");
fflush(stdout);
ROCTRACER_CALL(
roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, roctx_api_callback, NULL));
}
const char* ctrl_str = getenv("ROCP_CTRL_RATE");
@@ -934,10 +931,13 @@ void tool_load() {
uint32_t ctrl_rate = 0;
if (sscanf(ctrl_str, "%d:%d:%d", &ctrl_delay, &ctrl_len, &ctrl_rate) != 3) {
EXC_RAISING(ROCTRACER_STATUS_ERROR, "Invalid ROCP_CTRL_RATE var(" << ctrl_str << "), expected ctrl_delay:ctrl_len:ctrl_rate");
EXC_RAISING(
ROCTRACER_STATUS_ERROR,
"Invalid ROCP_CTRL_RATE var(" << ctrl_str << "), expected ctrl_delay:ctrl_len:ctrl_rate");
}
if (ctrl_len > ctrl_rate) {
EXC_RAISING(ROCTRACER_STATUS_ERROR, "Control length value " << ctrl_len << " > rate value " << ctrl_rate);
EXC_RAISING(ROCTRACER_STATUS_ERROR,
"Control length value " << ctrl_len << " > rate value " << ctrl_rate);
}
control_dist_us = ctrl_rate - ctrl_len;
control_len_us = ctrl_len;
@@ -946,14 +946,21 @@ void tool_load() {
roctracer_stop();
if (ctrl_delay != UINT32_MAX) {
fprintf(stdout, "ROCTracer: trace control: delay(%uus), length(%uus), rate(%uus)\n", ctrl_delay, ctrl_len, ctrl_rate); fflush(stdout);
fprintf(stdout, "ROCTracer: trace control: delay(%uus), length(%uus), rate(%uus)\n",
ctrl_delay, ctrl_len, ctrl_rate);
fflush(stdout);
pthread_t thread;
pthread_attr_t attr;
int err = pthread_attr_init(&attr);
if (err) { errno = err; perror("pthread_attr_init"); abort(); }
if (err) {
errno = err;
perror("pthread_attr_init");
abort();
}
err = pthread_create(&thread, &attr, control_thr_fun, NULL);
} else {
fprintf(stdout, "ROCTracer: trace start disabled\n"); fflush(stdout);
fprintf(stdout, "ROCTracer: trace start disabled\n");
fflush(stdout);
}
}
@@ -965,10 +972,15 @@ void tool_load() {
abort();
}
fprintf(stdout, "ROCTracer: trace control flush rate(%uus)\n", control_flush_us); fflush(stdout);
fprintf(stdout, "ROCTracer: trace control flush rate(%uus)\n", control_flush_us);
fflush(stdout);
pthread_attr_t attr;
int err = pthread_attr_init(&attr);
if (err) { errno = err; perror("pthread_attr_init"); abort(); }
if (err) {
errno = err;
perror("pthread_attr_init");
abort();
}
std::lock_guard<std::mutex> lock(flush_thread_mutex);
PTHREAD_CALL(pthread_create(&flush_thread, &attr, flush_thr_fun, NULL));
flush_thread_started = true;
@@ -978,7 +990,8 @@ void tool_load() {
}
// HSA-runtime tool on-load method
extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
uint64_t failed_tool_count,
const char* const* failed_tool_names) {
ONLOAD_TRACE_BEG();
@@ -998,17 +1011,20 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
// initialize HSA tracing
roctracer_set_properties(ACTIVITY_DOMAIN_HSA_API, (void*)table);
fprintf(stdout, " HSA-trace("); fflush(stdout);
fprintf(stdout, " HSA-trace(");
fflush(stdout);
if (hsa_api_vec().size() != 0) {
for (unsigned i = 0; i < hsa_api_vec().size(); ++i) {
uint32_t cid = HSA_API_ID_NUMBER;
const char* api = hsa_api_vec()[i].c_str();
ROCTRACER_CALL(roctracer_op_code(ACTIVITY_DOMAIN_HSA_API, api, &cid, NULL));
ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_API, cid, hsa_api_callback, NULL));
ROCTRACER_CALL(
roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_API, cid, hsa_api_callback, NULL));
printf(" %s", api);
}
} else {
ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HSA_API, hsa_api_callback, NULL));
ROCTRACER_CALL(
roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HSA_API, hsa_api_callback, NULL));
}
printf(")\n");
}
@@ -1018,21 +1034,20 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
hsa_async_copy_file_handle = open_output_file(output_prefix, "async_copy_trace.txt");
// initialize HSA tracing
roctracer::hsa_ops_properties_t ops_properties {
table,
reinterpret_cast<activity_async_callback_t>(hsa_activity_callback),
NULL,
output_prefix
};
roctracer::hsa_ops_properties_t ops_properties{
table, reinterpret_cast<activity_async_callback_t>(hsa_activity_callback), NULL,
output_prefix};
roctracer_set_properties(ACTIVITY_DOMAIN_HSA_OPS, &ops_properties);
fprintf(stdout, " HSA-activity-trace()\n"); fflush(stdout);
fprintf(stdout, " HSA-activity-trace()\n");
fflush(stdout);
ROCTRACER_CALL(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY));
}
// Enable HIP API callbacks/activity
if (trace_hip_api || trace_hip_activity) {
fprintf(stdout, " HIP-trace()\n"); fflush(stdout);
fprintf(stdout, " HIP-trace()\n");
fflush(stdout);
// roctracer properties
roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, (void*)mark_api_callback);
// Allocating tracing pool
@@ -1052,11 +1067,13 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
uint32_t cid = HIP_API_ID_NONE;
const char* api = hip_api_vec()[i].c_str();
ROCTRACER_CALL(roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, NULL));
ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL));
ROCTRACER_CALL(
roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL));
printf(" %s", api);
}
} else {
ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL));
ROCTRACER_CALL(
roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL));
}
if (is_stats_opt) {
@@ -1087,7 +1104,8 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
// Enable PC sampling
if (trace_pcs) {
fprintf(stdout, " PCS-trace()\n"); fflush(stdout);
fprintf(stdout, " PCS-trace()\n");
fflush(stdout);
open_tracing_pool();
pc_sample_file_handle = open_output_file(output_prefix, "pcs_trace.txt");
ROCTRACER_CALL(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_RESERVED1));
@@ -1103,17 +1121,19 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
}
// HSA-runtime on-unload method
extern "C" PUBLIC_API void OnUnload() {
ONLOAD_TRACE("");
}
extern "C" PUBLIC_API void OnUnload() { ONLOAD_TRACE(""); }
extern "C" CONSTRUCTOR_API void constructor() {
ONLOAD_TRACE_BEG();
roctracer::hip_support::HIP_depth_max = 0;
roctx_trace_buffer = new roctracer::TraceBuffer<roctx_trace_entry_t>("rocTX API", 0x200000, &roctx_flush_prm, 1);
hip_api_trace_buffer = new roctracer::TraceBuffer<hip_api_trace_entry_t>("HIP API", 0x200000, &hip_api_flush_prm, 1);
hip_act_trace_buffer = new roctracer::TraceBuffer<hip_act_trace_entry_t>("HIP ACT", 0x200000, &hip_act_flush_prm, 1, 1);
hsa_api_trace_buffer = new roctracer::TraceBuffer<hsa_api_trace_entry_t>("HSA API", 0x200000, &hsa_flush_prm, 1);
roctx_trace_buffer =
new roctracer::TraceBuffer<roctx_trace_entry_t>("rocTX API", 0x200000, &roctx_flush_prm, 1);
hip_api_trace_buffer =
new roctracer::TraceBuffer<hip_api_trace_entry_t>("HIP API", 0x200000, &hip_api_flush_prm, 1);
hip_act_trace_buffer = new roctracer::TraceBuffer<hip_act_trace_entry_t>(
"HIP ACT", 0x200000, &hip_act_flush_prm, 1, 1);
hsa_api_trace_buffer =
new roctracer::TraceBuffer<hsa_api_trace_entry_t>("HSA API", 0x200000, &hsa_flush_prm, 1);
roctracer_load();
tool_load();
ONLOAD_TRACE_END();