Change-Id: Iea47c25b3c9b6e1eadf097c34323727181975cca
Этот коммит содержится в:
Evgeny
2018-07-25 14:53:33 -05:00
родитель 2f83829f4f
Коммит 4d790c8eee
11 изменённых файлов: 384 добавлений и 281 удалений
+2 -2
Просмотреть файл
@@ -48,8 +48,8 @@ SOFTWARE.
#include <hsa_ven_amd_aqlprofile.h>
#include <stdint.h>
#define ROCPROFILER_VERSION_MAJOR 2
#define ROCPROFILER_VERSION_MINOR 1
#define ROCPROFILER_VERSION_MAJOR 3
#define ROCPROFILER_VERSION_MINOR 0
#ifdef __cplusplus
extern "C" {
+38 -57
Просмотреть файл
@@ -29,6 +29,9 @@ SOFTWARE.
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <unistd.h> // usleep
#include <atomic>
#include <list>
#include <map>
#include <mutex>
#include <vector>
@@ -49,27 +52,7 @@ class Context;
inline unsigned align_size(unsigned size, unsigned alignment) {
return ((size + alignment - 1) & ~(alignment - 1));
}
#if 0
// Block descriptor
struct block_des_t {
uint32_t id;
uint32_t index;
};
// block_des_t less-then functor
struct lt_block_des {
bool operator()(const block_des_t& a1, const block_des_t& a2) const {
return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index));
}
};
// Block status
struct block_status_t {
uint32_t max_counters;
uint32_t counter_index;
uint32_t group_index;
};
#endif
// Metrics arguments
template <class Map> class MetricArgs : public xml::args_cache_t {
public:
@@ -97,6 +80,9 @@ template <class Map> class MetricArgs : public xml::args_cache_t {
// Profiling group
class Group {
public:
typedef uint32_t refs_t;
typedef std::atomic<refs_t> atomic_refs_t;
Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index)
: pmc_profile_(agent_info),
sqtt_profile_(agent_info),
@@ -146,10 +132,10 @@ class Group {
Context* GetContext() { return context_; }
uint32_t GetIndex() const { return index_; }
void ResetRefs() { refs_ = n_profiles_; }
uint32_t DecrRefs() {
return (refs_ > 0) ? --refs_ : 0;
}
atomic_refs_t* AtomicRefsCount() { return reinterpret_cast<atomic_refs_t*>(&refs_); }
void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); }
void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); }
uint32_t FetchDecrRefsCount() { return AtomicRefsCount()->fetch_sub(1, std::memory_order_acq_rel); }
private:
PmcProfile pmc_profile_;
@@ -159,7 +145,7 @@ class Group {
pkt_vector_t stop_vector_;
pkt_vector_t read_vector_;
uint32_t n_profiles_;
uint32_t refs_;
refs_t refs_;
Context* const context_;
const uint32_t index_;
};
@@ -167,7 +153,6 @@ class Group {
// Profiling context
class Context {
public:
typedef std::mutex mutex_t;
typedef std::map<std::string, rocprofiler_feature_t*> info_map_t;
Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info,
@@ -180,6 +165,8 @@ class Context {
handler_(handler),
handler_arg_(handler_arg)
{
if (info_count == 0) return;
metrics_ = MetricsDict::Create(agent_info);
if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed");
if (Initialize(info, info_count) == false) {
@@ -192,7 +179,7 @@ class Context {
if (handler != NULL) {
for (unsigned group_index = 0; group_index < set_.size(); ++group_index) {
set_[group_index].ResetRefs();
set_[group_index].ResetRefsCount();
const profile_vector_t profile_vector = GetProfiles(group_index);
for (auto& tuple : profile_vector) {
// Handler for stop packet completion
@@ -307,11 +294,11 @@ class Context {
}
}
void Reset(const uint32_t& group_index) { set_[group_index].ResetRefs(); }
void Reset(const uint32_t& group_index) { set_[group_index].ResetRefsCount(); }
uint32_t GetGroupCount() const { return set_.size(); }
rocprofiler_group_t GetGroupInfo(Group* g) {
inline rocprofiler_group_t GetGroupInfo(Group* g) {
rocprofiler::info_vector_t& info_vector = g->GetInfoVector();
rocprofiler_group_t group = {};
group.index = g->GetIndex();
@@ -320,8 +307,14 @@ class Context {
group.feature_count = info_vector.size();
return group;
}
rocprofiler_group_t GetGroupInfo(const uint32_t& index) {
return GetGroupInfo(&set_[index]);
inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) {
rocprofiler_group_t group = {};
if (set_.empty()) {
group.context = reinterpret_cast<rocprofiler_t*>(this);
} else {
group = GetGroupInfo(&set_[index]);
}
return group;
}
const pkt_vector_t& StartPackets(const uint32_t& group_index) const {
@@ -368,14 +361,7 @@ class Context {
const profile_vector_t profile_vector = GetProfiles(group_index);
for (auto& tuple : profile_vector) {
// Wait for stop packet to complete
const uint64_t timeout = timeout_;
bool complete = false;
while (!complete) {
const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout,
HSA_WAIT_STATE_BLOCKED);
complete = (signal_value < 1);
if (!complete) WARN_LOGGING("timeout");
}
hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1);
for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT;
callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL};
const hsa_status_t status =
@@ -411,8 +397,19 @@ class Context {
}
}
static void SetTimeout(uint64_t timeout) { timeout_ = timeout; }
static uint64_t GetTimeout() { return timeout_; }
static bool Handler(hsa_signal_value_t value, void* arg) {
Group* group = reinterpret_cast<Group*>(arg);
Context* context = group->GetContext();
auto r = group->FetchDecrRefsCount();
if (r == 1) {
const rocprofiler_group_t group_info = context->GetGroupInfo(group);
context->handler_(group_info, context->handler_arg_);
}
return false;
}
Group* GetGroup(const uint32_t& index) { return &set_[index]; }
rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; }
private:
// Getting profling packets
@@ -425,18 +422,6 @@ class Context {
return vec;
}
static bool Handler(hsa_signal_value_t value, void* arg) {
Group* group = reinterpret_cast<Group*>(arg);
Context* context = group->GetContext();
context->mutex_.lock();
uint32_t r = group->DecrRefs();
context->mutex_.unlock();
if (r == 0) {
return context->handler_(context->GetGroupInfo(group), context->handler_arg_);
}
return false;
}
static hsa_status_t DataCallback(hsa_ven_amd_aqlprofile_info_type_t ainfo_type,
hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) {
hsa_status_t status = HSA_STATUS_SUCCESS;
@@ -526,9 +511,6 @@ class Context {
return info;
}
// Profiling data waiting timeout
static uint64_t timeout_;
// GPU handel
const hsa_agent_t agent_;
const util::AgentInfo* agent_info_;
@@ -551,7 +533,6 @@ class Context {
// Context completion handler
rocprofiler_handler_t handler_;
void* handler_arg_;
mutex_t mutex_;
};
} // namespace rocprofiler
-1
Просмотреть файл
@@ -36,7 +36,6 @@ InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL;
void* InterceptQueue::callback_data_ = NULL;
InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL;
const char* InterceptQueue::kernel_none_ = "";
uint64_t InterceptQueue::timeout_ = UINT64_MAX;
Tracker* InterceptQueue::tracker_ = NULL;
bool InterceptQueue::tracker_on_ = false;
bool InterceptQueue::in_constr_call_ = false;
+47 -20
Просмотреть файл
@@ -50,6 +50,7 @@ class InterceptQueue {
typedef std::recursive_mutex mutex_t;
typedef std::map<uint64_t, InterceptQueue*> obj_map_t;
typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data);
typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg);
static void HsaIntercept(HsaApiTable* table);
@@ -65,12 +66,12 @@ class InterceptQueue {
if (in_constr_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()");
in_constr_call_ = true;
ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, callback, data, private_segment_size,
ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size,
group_segment_size, queue, &status);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()");
if (tracker_on || tracker_on_) {
if (tracker_ == NULL) tracker_ = new Tracker(timeout_);
if (tracker_ == NULL) tracker_ = new Tracker;
status = hsa_amd_profiling_set_profiler_enabled(*queue, true);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()");
}
@@ -79,6 +80,7 @@ class InterceptQueue {
InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy);
(*obj_map_)[(uint64_t)(*queue)] = obj;
status = proxy->SetInterceptCB(OnSubmitCB, obj);
obj->queue_event_callback_ = callback;
in_constr_call_ = false;
return status;
@@ -132,11 +134,10 @@ class InterceptQueue {
reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(packet);
// Adding kernel timing tracker
const rocprofiler_dispatch_record_t* record = NULL;
Tracker::entry_t* tracker_entry = NULL;
if (tracker_ != NULL) {
const auto* entry = tracker_->Add(obj->agent_info_->dev_id, dispatch_packet->completion_signal);
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = entry->signal;
record = entry->record;
tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal);
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = tracker_entry->signal;
}
// Prepareing dispatch callback data
@@ -147,27 +148,44 @@ class InterceptQueue {
user_que_idx,
dispatch_packet,
kernel_name,
record};
(tracker_entry) ? tracker_entry->record : NULL};
// Calling dispatch callback
rocprofiler_group_t group = {};
hsa_status_t status = dispatch_callback_(&data, callback_data_, &group);
free(const_cast<char*>(kernel_name));
// Injecting profiling start/stop packets
if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) {
if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) {
if (tracker_entry != NULL) tracker_->Delete(tracker_entry);
} else {
Context* context = reinterpret_cast<Context*>(group.context);
const pkt_vector_t& start_vector = context->StartPackets(group.index);
const pkt_vector_t& stop_vector = context->StopPackets(group.index);
pkt_vector_t packets = start_vector;
packets.insert(packets.end(), *packet);
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
if (writer != NULL) {
writer(&packets[0], packets.size());
if (group.feature_count != 0) {
if (tracker_entry != NULL) {
Group* context_group = context->GetGroup(group.index);
context_group->IncrRefsCount();
tracker_->Enable(tracker_entry, Context::Handler, reinterpret_cast<void*>(context_group));
}
const pkt_vector_t& start_vector = context->StartPackets(group.index);
const pkt_vector_t& stop_vector = context->StopPackets(group.index);
pkt_vector_t packets = start_vector;
packets.insert(packets.end(), *packet);
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
if (writer != NULL) {
writer(&packets[0], packets.size());
} else {
proxy->Submit(&packets[0], packets.size());
}
to_submit = false;
} else {
proxy->Submit(&packets[0], packets.size());
if (tracker_entry != NULL) {
void* context_handler_arg = NULL;
rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg);
tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg);
rocprofiler_close(context);
}
}
to_submit = false;
}
}
@@ -189,11 +207,16 @@ class InterceptQueue {
destroy_callback_ = destroy_callback;
}
static void SetTimeout(uint64_t timeout) { timeout_ = timeout; }
static void TrackerOn(bool on) { tracker_on_ = on; }
static bool IsTrackerOn() { return tracker_on_; }
private:
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported");
InterceptQueue* obj = GetObj(queue);
if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg);
}
static hsa_packet_type_t GetHeaderType(const packet_t* packet) {
const packet_word_t* header = reinterpret_cast<const packet_word_t*>(packet);
return static_cast<hsa_packet_type_t>((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask);
@@ -258,8 +281,12 @@ class InterceptQueue {
proxy_(proxy)
{
agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
queue_event_callback_ = NULL;
}
~InterceptQueue() {
ProxyQueue::Destroy(proxy_);
}
~InterceptQueue() { ProxyQueue::Destroy(proxy_); }
static mutex_t mutex_;
static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
@@ -268,7 +295,6 @@ class InterceptQueue {
static void* callback_data_;
static obj_map_t* obj_map_;
static const char* kernel_none_;
static uint64_t timeout_;
static Tracker* tracker_;
static bool tracker_on_;
static bool in_constr_call_;
@@ -276,6 +302,7 @@ class InterceptQueue {
hsa_queue_t* const queue_;
ProxyQueue* const proxy_;
const util::AgentInfo* agent_info_;
queue_event_callback_t queue_event_callback_;
};
} // namespace rocprofiler
+7 -6
Просмотреть файл
@@ -154,7 +154,7 @@ bool LoadTool() {
settings.intercept_mode = (intercept_mode) ? 1 : 0;
settings.sqtt_size = SqttProfile::GetSize();
settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0;
settings.timeout = Context::GetTimeout();
settings.timeout = util::HsaRsrcFactory::GetTimeoutNs();
settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 1 : 0;
if (handler) handler();
@@ -163,8 +163,7 @@ bool LoadTool() {
intercept_mode = (settings.intercept_mode != 0);
SqttProfile::SetSize(settings.sqtt_size);
SqttProfile::SetLocal(settings.sqtt_local != 0);
Context::SetTimeout(settings.timeout);
InterceptQueue::SetTimeout(settings.timeout);
util::HsaRsrcFactory::SetTimeoutNs(settings.timeout);
InterceptQueue::TrackerOn(settings.timestamp_on != 0);
}
@@ -190,8 +189,8 @@ CONSTRUCTOR_API void constructor() {
}
DESTRUCTOR_API void destructor() {
util::HsaRsrcFactory::Destroy();
rocprofiler::MetricsDict::Destroy();
util::HsaRsrcFactory::Destroy();
util::Logger::Destroy();
}
@@ -213,10 +212,12 @@ hsa_status_t GetExcStatus(const std::exception& e) {
}
rocprofiler_properties_t rocprofiler_properties;
uint64_t Context::timeout_ = UINT64_MAX;
#if DEFERRED_HANDLERS
Context::deferred_handler_list_t* Context::deferred_handler_list_ = NULL;
Context::recur_mutex_t Context::mutex_static_;
#endif
uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M
bool SqttProfile::output_buffer_local_ = true;
Tracker::mutex_t Tracker::mutex_;
util::Logger::mutex_t util::Logger::mutex_;
util::Logger* util::Logger::instance_ = NULL;
}
+103 -79
Просмотреть файл
@@ -30,9 +30,11 @@ SOFTWARE.
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <atomic>
#include <list>
#include <mutex>
#include "util/hsa_rsrc_factory.h"
#include "inc/rocprofiler.h"
#include "util/exception.h"
#include "util/logger.h"
@@ -41,12 +43,13 @@ namespace rocprofiler {
class Tracker {
public:
typedef uint64_t timestamp_t;
typedef long double freq_t;
typedef std::mutex mutex_t;
typedef util::HsaRsrcFactory::timestamp_t timestamp_t;
typedef rocprofiler_dispatch_record_t record_t;
struct entry_t;
typedef std::list<entry_t*> sig_list_t;
typedef sig_list_t::iterator sig_list_it_t;
struct entry_t {
Tracker* tracker;
sig_list_t::iterator it;
@@ -54,75 +57,59 @@ class Tracker {
hsa_signal_t orig;
hsa_signal_t signal;
record_t* record;
std::atomic<void*> handler;
void* arg;
bool context_active;
};
Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout), outstanding(0) {
timestamp_t timestamp_hz = 0;
hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &timestamp_hz);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)");
timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz;
}
Tracker() :
outstanding_(0),
hsa_rsrc_(&(util::HsaRsrcFactory::Instance()))
{}
~Tracker() {
mutex_.lock();
for (entry_t* entry : sig_list_) {
assert(entry != NULL);
while (1) {
const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(
entry->signal,
HSA_SIGNAL_CONDITION_LT,
1,
timeout_,
HSA_WAIT_STATE_BLOCKED);
if (signal_value < 1) break;
else WARN_LOGGING("tracker timeout");
}
Del(entry);
auto it = sig_list_.begin();
auto end = sig_list_.end();
while (it != end) {
auto cur = it++;
hsa_rsrc_->SignalWait((*cur)->signal);
Erase(cur);
}
mutex_.unlock();
}
// Add tracker entry
entry_t* Add(const hsa_agent_t& agent, const hsa_signal_t& orig) {
entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) {
hsa_status_t status = HSA_STATUS_ERROR;
// Creating a new tracker entry
entry_t* entry = new entry_t{};
assert(entry);
entry->tracker = this;
entry->agent = agent;
entry->orig = orig;
// Creating a record with the dispatch timestamps
record_t* record = new record_t{};
assert(record);
record->dispatch = hsa_rsrc_->TimestampNs();
entry->record = record;
// Creating a proxy signal
status = hsa_signal_create(1, 0, NULL, &(entry->signal));
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
// Adding antry to the list
mutex_.lock();
entry->it = sig_list_.insert(sig_list_.begin(), entry);
mutex_.unlock();
entry->agent = agent;
entry->orig = orig;
status = hsa_signal_create(1, 0, NULL, &(entry->signal));
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
record_t* record = new record_t{};
assert(record);
entry->record = record;
timestamp_t dispatch_timestamp = 0;
status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &dispatch_timestamp);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)");
record->dispatch = timestamp2ns(dispatch_timestamp);
status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
if (trace_on_) {
mutex_.lock();
entry->tracker->outstanding++;
fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding);
fflush(stdout);
mutex_.unlock();
}
return entry;
}
private:
// Delete tracker entry
void Del(entry_t* entry) {
void Delete(entry_t* entry) {
hsa_signal_destroy(entry->signal);
mutex_.lock();
sig_list_.erase(entry->it);
@@ -130,31 +117,53 @@ class Tracker {
delete entry;
}
// Handler for packet completion
static bool Handler(hsa_signal_value_t value, void* arg) {
entry_t* entry = reinterpret_cast<entry_t*>(arg);
// Enable tracker entry
void Enable(entry_t* entry, void* handler, void* arg) {
// Set entry handler and release the entry
entry->arg = arg;
entry->handler.store(handler, std::memory_order_release);
// Debug trace
if (trace_on_) {
auto outstanding = outstanding_.fetch_add(1);
fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding);
fflush(stdout);
}
}
void Enable(entry_t* entry, hsa_amd_signal_handler handler, void* arg) {
entry->context_active = true;
Enable(entry, reinterpret_cast<void*>(handler), arg);
}
void Enable(entry_t* entry, rocprofiler_handler_t handler, void* arg) {
Enable(entry, reinterpret_cast<void*>(handler), arg);
}
private:
// Delete an entry by iterator
void Erase(const sig_list_it_t& it) { Delete(*it); }
// Entry completion
void Complete(entry_t* entry) {
record_t* record = entry->record;
// Debug trace
if (trace_on_) {
mutex_.lock();
entry->tracker->outstanding--;
fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding);
auto outstanding = outstanding_.fetch_sub(1);
fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding);
fflush(stdout);
mutex_.unlock();
}
timestamp_t complete_timestamp = 0;
// Query begin/end and complete timestamps
hsa_amd_profiling_dispatch_time_t dispatch_time{};
hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &complete_timestamp);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)");
status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time);
hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time");
record->complete = entry->tracker->timestamp2ns(complete_timestamp);
record->begin = entry->tracker->timestamp2ns(dispatch_time.start);
record->end = entry->tracker->timestamp2ns(dispatch_time.end);
record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start);
record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end);
record->complete = hsa_rsrc_->TimestampNs();
// Original intercepted signal completion
hsa_signal_t orig = entry->orig;
if (orig.handle) {
amd_signal_t* orig_signal_ptr = reinterpret_cast<amd_signal_t*>(orig.handle);
@@ -165,26 +174,41 @@ class Tracker {
const hsa_signal_value_t value = hsa_signal_load_relaxed(orig);
hsa_signal_store_screlease(orig, value - 1);
}
entry->tracker->Del(entry);
}
// Handler for packet completion
static bool Handler(hsa_signal_value_t, void* arg) {
// Acquire entry
entry_t* entry = reinterpret_cast<entry_t*>(arg);
volatile std::atomic<void*>* ptr = &entry->handler;
while (ptr->load(std::memory_order_acquire) == NULL) sched_yield();
// Complete entry
entry->tracker->Complete(entry);
// Call entry handler
void* handler = static_cast<void*>(entry->handler);
if (entry->context_active) {
reinterpret_cast<hsa_amd_signal_handler>(handler)(0, entry->arg);
} else {
rocprofiler_group_t group{};
reinterpret_cast<rocprofiler_handler_t>(handler)(group, entry->arg);
}
// Delete tracker entry
entry->tracker->Delete(entry);
return false;
}
inline timestamp_t timestamp2ns(const timestamp_t& timestamp) const {
const freq_t timestamp_ns = (freq_t)timestamp * timestamp_factor_;
return (timestamp_t)timestamp_ns;
}
// Timestamp frequency factor
freq_t timestamp_factor_;
// Timeout for wait on destruction
timestamp_t timeout_;
// Tracked signals list
sig_list_t sig_list_;
// Inter-thread synchronization
static mutex_t mutex_;
mutex_t mutex_;
// Outstanding dispatches
uint64_t outstanding;
std::atomic<uint64_t> outstanding_;
// HSA resources factory
util::HsaRsrcFactory* hsa_rsrc_;
// Enable tracing
static const bool trace_on_ = false;
};
+1
Просмотреть файл
@@ -34,6 +34,7 @@ typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t;
typedef hsa_ven_amd_aqlprofile_profile_t profile_t;
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
typedef uint32_t packet_word_t;
typedef uint64_t timestamp_t;
} // namespace rocprofiler
#endif // SRC_CORE_TYPES_H_
+41 -10
Просмотреть файл
@@ -44,6 +44,9 @@ SOFTWARE.
#include <string>
#include <vector>
#include "util/exception.h"
#include "util/logger.h"
namespace rocprofiler {
namespace util {
@@ -110,11 +113,13 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
// Constructor of the class
HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
hsa_status_t status;
// Initialize the Hsa Runtime
if (initialize_hsa_) {
status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
}
// Discover the set of Gpu devices available on the platform
status = hsa_iterate_agents(GetHsaAgentsCallback, this);
CHECK_STATUS("Error Calling hsa_iterate_agents", status);
@@ -132,10 +137,19 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize
loader_api_ = {0};
status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
CHECK_STATUS("loader API table query failed", status);
// Instantiate HSA timer
timer_ = new HsaTimer;
CHECK_STATUS("HSA timer allocation failed",
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS);
// System timeout
timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_);
}
// Destructor of the class
HsaRsrcFactory::~HsaRsrcFactory() {
delete timer_;
for (auto p : cpu_list_) delete p;
for (auto p : gpu_list_) delete p;
if (initialize_hsa_) {
@@ -403,22 +417,38 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s
return ptr;
}
// Wait signal
void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const {
while (1) {
const hsa_signal_value_t signal_value =
hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED);
if (signal_value == 0) {
break;
} else {
if (signal_value == 1) WARN_LOGGING("signal waiting...");
else EXC_RAISING(HSA_STATUS_ERROR, "hsa_signal_wait_scacquire (" << signal_value << ")");
}
}
}
// Wait signal with signal value restore
void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const {
SignalWait(signal);
hsa_signal_store_relaxed(const_cast<hsa_signal_t&>(signal), signal_value);
}
// Copy data from GPU to host memory
bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
if (!cpu_agents_.empty()) {
hsa_signal_t s = {};
status = hsa_signal_create(1, 0, NULL, &s);
if (status == HSA_STATUS_SUCCESS) {
status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
if (status == HSA_STATUS_SUCCESS) {
if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_BLOCKED) != 0) {
status = HSA_STATUS_ERROR;
}
}
status = hsa_signal_destroy(s);
}
CHECK_STATUS("hsa_signal_create()", status);
status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
CHECK_STATUS("hsa_amd_memory_async_copy()", status);
SignalWait(s);
status = hsa_signal_destroy(s);
CHECK_STATUS("hsa_signal_destroy()", status);
}
return (status == HSA_STATUS_SUCCESS);
}
@@ -561,6 +591,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s
HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL;
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX;
} // namespace util
} // namespace rocprofiler
+59
Просмотреть файл
@@ -118,9 +118,42 @@ struct AgentInfo {
uint32_t shader_arrays_per_se;
};
// HSA timer class
// Provides current HSA timestampa and system-clock/ns conversion API
class HsaTimer {
public:
typedef uint64_t timestamp_t;
static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
typedef long double freq_t;
HsaTimer() {
timestamp_t sysclock_hz = 0;
hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status);
sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz;
}
// Methids for system-clock/ns conversion
timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); }
timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); }
// Return timestamp in 'ns'
timestamp_t timestamp_ns() const {
timestamp_t sysclock;
hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status);
return sysclock_to_ns(sysclock);
}
private:
// Timestamp frequency factor
freq_t sysclock_factor_;
};
class HsaRsrcFactory {
public:
typedef std::recursive_mutex mutex_t;
typedef HsaTimer::timestamp_t timestamp_t;
static HsaRsrcFactory* Create(bool initialize_hsa = true) {
std::lock_guard<mutex_t> lck(mutex_);
@@ -206,6 +239,12 @@ class HsaRsrcFactory {
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);
// Wait signal
void SignalWait(const hsa_signal_t& signal) const;
// Wait signal with signal value restore
void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const;
// Copy data from GPU to host memory
bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size);
bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size);
@@ -237,6 +276,18 @@ class HsaRsrcFactory {
// Return Loader API table
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
// Methods for system-clock/ns conversion and timestamp in 'ns'
timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); }
timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); }
timestamp_t TimestampNs() const { return timer_->timestamp_ns(); }
timestamp_t GetSysTimeout() const { return timeout_; }
static timestamp_t GetTimeoutNs() { return timeout_ns_; }
static void SetTimeoutNs(const timestamp_t& time) {
std::lock_guard<mutex_t> lck(mutex_);
timeout_ns_ = time;
if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time);
}
private:
// System agents iterating callback
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
@@ -282,6 +333,14 @@ class HsaRsrcFactory {
// Loader API table
hsa_ven_amd_loader_1_00_pfn_t loader_api_;
// System timeout, ns
static timestamp_t timeout_ns_;
// System timeout, sysclock
timestamp_t timeout_;
// HSA timer
HsaTimer* timer_;
};
} // namespace util
+9 -9
Просмотреть файл
@@ -191,33 +191,33 @@ class Logger {
} // namespace rocprofiler
#define ERR_LOGGING(stream) \
{ \
do { \
rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::errm \
<< "error: " << rocprofiler::util::Logger::begm \
<< stream << rocprofiler::util::Logger::endl; \
}
} while(0)
#define INFO_LOGGING(stream) \
{ \
do { \
rocprofiler::util::Logger::Instance() << "info: " << rocprofiler::util::Logger::begm << stream \
<< rocprofiler::util::Logger::endl; \
}
} while(0)
#define WARN_LOGGING(stream) \
{ \
std::cerr << "ROCProfiler: " << stream << std::endl; \
do { \
std::cerr << "ROCProfiler: " << stream << std::endl; \
rocprofiler::util::Logger::Instance() << "warning: " << rocprofiler::util::Logger::begm << stream \
<< rocprofiler::util::Logger::endl; \
}
} while(0)
#ifdef DEBUG
#define DBG_LOGGING(stream) \
{ \
do { \
rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::begm << "debug: \"" \
<< stream << "\"" < < < < \
" in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \
<< rocprofiler::util::Logger::endl; \
}
} while(0)
#endif
#endif // SRC_UTIL_LOGGER_H_
+77 -97
Просмотреть файл
@@ -39,11 +39,14 @@ SOFTWARE.
#include <sys/types.h>
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <iostream>
#include <list>
#include <map>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include "inc/rocprofiler.h"
@@ -70,7 +73,8 @@ struct callbacks_data_t {
// Context stored entry type
struct context_entry_t {
uint32_t valid;
bool valid;
bool active;
uint32_t index;
hsa_agent_t agent;
rocprofiler_group_t group;
@@ -95,8 +99,6 @@ callbacks_data_t* callbacks_data = NULL;
// Stored contexts array
typedef std::map<uint32_t, context_entry_t> context_array_t;
context_array_t* context_array = NULL;
typedef std::list<context_entry_t*> wait_list_t;
wait_list_t* wait_list = NULL;
// Contexts collected count
volatile uint32_t context_count = 0;
volatile uint32_t context_collected = 0;
@@ -171,9 +173,11 @@ std::string filtr_kernel_name(const std::string name) {
}
++rit;
}
while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++;
// while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++;
while (rit != rend) if ((*rit == ' ') || (*rit == ' ')) rit++; else break;
auto rbeg = rit;
while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++;
// while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++;
while (rit != rend) if ((*rit != ' ') && (*rit != ':')) rit++; else break;
const uint32_t pos = rend - rit;
const uint32_t length = rit - rbeg;
return name.substr(pos, length);
@@ -384,11 +388,12 @@ void output_group(const context_entry_t* entry, const char* label) {
}
}
// Dump stored context profiling output data
bool dump_context(context_entry_t* entry) {
// Dump stored context entry
bool dump_context_entry(context_entry_t* entry) {
hsa_status_t status = HSA_STATUS_ERROR;
if (entry->valid == 0) return true;
volatile std::atomic<bool>* valid = reinterpret_cast<std::atomic<bool>*>(&entry->valid);
while (valid->load() == false) sched_yield();
const rocprofiler_dispatch_record_t* record = entry->data.record;
if (record) {
@@ -438,65 +443,48 @@ bool dump_context(context_entry_t* entry) {
rocprofiler_close(group.context);
}
entry->valid = 0;
return true;
}
// Dump and clean a given context entry
static inline bool dump_context_entry(context_entry_t* entry) {
const bool ret = dump_context(entry);
if (ret) dealloc_context_entry(entry);
return ret;
}
// Dump waiting entries
static inline void dump_wait_list() {
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
auto it = wait_list->begin();
auto end = wait_list->end();
while (it != end) {
auto cur = it++;
if (dump_context_entry(*cur)) {
wait_list->erase(cur);
// Wait for and dump all stored contexts for a given queue if not NULL
void dump_context_array(hsa_queue_t* queue) {
bool done = false;
while (done == false) {
done = true;
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
}
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
}
// Dump all stored contexts profiling output data
void dump_context_array() {
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
if (context_array) {
if (!wait_list->empty()) dump_wait_list();
auto it = context_array->begin();
auto end = context_array->end();
while (it != end) {
auto cur = it++;
dump_context(&(cur->second));
if (context_array) {
auto it = context_array->begin();
auto end = context_array->end();
while (it != end) {
auto cur = it++;
context_entry_t* entry = &(cur->second);
volatile std::atomic<bool>* valid = reinterpret_cast<std::atomic<bool>*>(&entry->valid);
while (valid->load() == false) sched_yield();
if ((queue == NULL) || (entry->data.queue == queue)) {
if (entry->active == true) {
if (dump_context_entry(&(cur->second)) == false) done = false;
else entry->active = false;
}
}
}
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
if (done == false) sched_yield();
}
}
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
}
// Profiling completion handler
bool handler(rocprofiler_group_t group, void* arg) {
// Dump and delete the context entry
// Return true if the context was dumped successfully
bool context_handler(rocprofiler_group_t group, void* arg) {
context_entry_t* entry = reinterpret_cast<context_entry_t*>(arg);
if (pthread_mutex_lock(&mutex) != 0) {
@@ -504,11 +492,15 @@ bool handler(rocprofiler_group_t group, void* arg) {
abort();
}
if (!wait_list->empty()) dump_wait_list();
if (!dump_context_entry(entry)) {
wait_list->push_back(entry);
bool ret = true;
if (entry->active == true) {
ret = dump_context_entry(entry);
if (ret == false) {
fprintf(stderr, "tool error: context is not complete\n");
abort();
}
}
if (ret) dealloc_context_entry(entry);
if (trace_on) {
fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid());
@@ -579,7 +571,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
context_entry_t* entry = alloc_context_entry();
// context properties
rocprofiler_properties_t properties{};
properties.handler = (result_prefix != NULL) ? handler : NULL;
properties.handler = (result_prefix != NULL) ? context_handler : NULL;
properties.handler_arg = (void*)entry;
rocprofiler_feature_t* features = tool_data->features;
@@ -600,22 +592,20 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
feature_count = next_offset - set_offset;
}
if (feature_count > 0) {
// Open profiling context
status = rocprofiler_open(callback_data->agent, features, feature_count,
&context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties);
check_status(status);
// Open profiling context
status = rocprofiler_open(callback_data->agent, features, feature_count,
&context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties);
check_status(status);
// Check that we have only one profiling group
uint32_t group_count = 0;
status = rocprofiler_group_count(context, &group_count);
check_status(status);
assert(group_count == 1);
// Get group[0]
const uint32_t group_index = 0;
status = rocprofiler_get_group(context, group_index, group);
check_status(status);
}
// Check that we have only one profiling group
uint32_t group_count = 0;
status = rocprofiler_group_count(context, &group_count);
check_status(status);
assert(group_count == 1);
// Get group[0]
const uint32_t group_index = 0;
status = rocprofiler_get_group(context, group_index, group);
check_status(status);
// Fill profiling context entry
entry->agent = callback_data->agent;
@@ -625,7 +615,8 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
entry->data = *callback_data;
entry->data.kernel_name = strdup(callback_data->kernel_name);
entry->file_handle = tool_data->file_handle;
entry->valid = 1;
entry->active = true;
reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);
if (trace_on) {
fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid());
@@ -637,7 +628,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
hsa_status_t destroy_callback(hsa_queue_t* queue, void*) {
if (result_file_opened == false) printf("\nROCProfiler results:\n");
dump_context_array();
dump_context_array(queue);
return HSA_STATUS_SUCCESS;
}
@@ -969,7 +960,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
// Context array aloocation
context_array = new context_array_t;
wait_list = new wait_list_t;
// Adding dispatch observer
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
@@ -1020,21 +1010,13 @@ extern "C" PUBLIC_API void OnUnloadTool() {
rocprofiler_remove_queue_callbacks();
// Dump stored profiling output data
printf("\nROCPRofiler: %u contexts collected", context_collected);
if (result_file_opened) printf(", output directory %s", result_prefix);
printf("\n"); fflush(stdout);
dump_context_array();
if (wait_list) {
if (!wait_list->empty()) {
printf("\nWaiting for pending kernels ..."); fflush(stdout);
while (wait_list->size() != 0) {
usleep(1000);
dump_wait_list();
}
printf(".done\n"); fflush(stdout);
}
printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout);
dump_context_array(NULL);
if (result_file_opened) {
fclose(result_file_handle);
printf(", output directory %s", result_prefix);
}
if (result_file_opened) fclose(result_file_handle);
printf("\n"); fflush(stdout);
// Cleanup
if (callbacks_data != NULL) {
@@ -1052,8 +1034,6 @@ extern "C" PUBLIC_API void OnUnloadTool() {
range_vec = NULL;
delete context_array;
context_array = NULL;
delete wait_list;
wait_list = NULL;
}
extern "C" DESTRUCTOR_API void destructor() {