Files
rocm-systems/test/tool/tool.cpp
T
Ammar ELWazir d4a33cf33a Pull from Github
Squashed commit of the following:

commit f029195705a15700380c6f832ba5d15d46fd6de7
Author: Jonathan R. Madsen <jrmadsen@users.noreply.github.com>
Date:   Thu Jul 13 14:38:56 2023 -0500

    Formatting workflows for source (clang-format) and cmake (cmake-format) (#4)

    * Add .cmake-format.yaml file

    * Add formatting workflow

    * provide base input for creating PR

    * Update scheme for extracting branch name

    - disable running formatting on push to amd-staging branch

    * patch .cmake-format.yaml for find_package signature

    - apparently cmake-format doesn't format the full signature of find_package

    * run formatting (clang-format v11) (#7)

    Co-authored-by: jrmadsen <jrmadsen@users.noreply.github.com>

    * run cmake formatting (cmake-format) (#6)

    Co-authored-by: jrmadsen <jrmadsen@users.noreply.github.com>

    ---------

    Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

commit bc4d135fdd8a1a9e51235f18a5d575fd2b3735e6
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Thu Jul 13 12:55:17 2023 -0500

    Removing Build cache for potential issues with auto-generated header files (#5)

    Change-Id: I9e2319f4335e2f88585ffa6fac2bd88a1c952e6e

commit ce86dea6a311d44d880fa684eb78f3329295e2a4
Author: Jonathan R. Madsen <jrmadsen@users.noreply.github.com>
Date:   Thu Jul 13 11:08:58 2023 -0500

    Fix decltype(<hsa-function>) function pointer usage (#3)

    - the following is done in several places:
        decltype(hsa_memory_allocate)* hsa_memory_allocate
    - above can cause compiler errors
    - replace decltype(<hsa-function>) with decltype(::<hsa-function>)
      - this ensures that the type within the decltype is recognized as the global scope HSA function, not the variable
    - in many places, the variable has a "_fn" suffix to prevent this issue but added '::' anyway for consistency

commit ac49fdd92a72e9c99394253a02da413a6c2e3b3a
Merge: a07946a 03a0855
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Wed Jul 12 11:36:24 2023 -0500

    Merge pull request #2 from ROCm-Developer-Tools/gerrit-amd-staging

    Pull from gerrit

commit 03a085588cffe863e8f466de67be1cfb205b675a
Merge: e88cad2 a07946a
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Wed Jul 12 10:57:30 2023 -0500

    Merge branch 'amd-staging' into gerrit-amd-staging

commit a07946a5cd4c670c83c27ad1a076a9d4567ce6d7
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 15:46:04 2023 +0000

    Enabling Cached Builds

commit 525e494a7f13941077a8fd4ad6840904db4d27d4
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 04:53:54 2023 +0000

    Updating missed GPU Targets

commit 42c75862f628c9bee7cfb7dc04dff2619430efbc
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 04:43:02 2023 +0000

    Adding V1 Testing

commit 9d72fd4aee85e4b0c12e717060d2730fa5b73be1
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 03:34:31 2023 +0000

    Fixing Artifacts directory path

commit f4000cc558b3b2e4676f7994f7ce8c8e6f94518e
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 03:27:26 2023 +0000

    Fixing CMake for test build job

commit 2ce8115d4c33948c3c8f957f545a95a04e1d6cd2
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 03:16:18 2023 +0000

    Fixing Ubuntu CMake for ubuntu test build

commit 6d0ed439191be900748d0c025157f9d689a73ec7
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 01:28:41 2023 +0000

    Removing Navi21

commit e349a7642e5ae5eb03ab9fcd0a0f74f09f78cab5
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 01:14:14 2023 +0000

    Removing Navi21

commit fefd02fe68d2a4bca7ec2e381960ad004ee9fc5b
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 00:42:48 2023 +0000

    Fixing CMake Job

commit 2ea46abf7bf92643efa8c549fa70346ffbd79d65
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 00:35:13 2023 +0000

    Fixing CMake Job

commit d99d681ed1999c5fcf291dc678b11a77205fb0f3
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Wed Jul 12 00:32:13 2023 +0000

    Fixing Pull Latest Dockers and CMake Jobs

commit dfc4498072d13b4a1df3a63047d34c682c3d9a29
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Tue Jul 11 23:54:21 2023 +0000

    Fixing CMake job

commit 919efe04de707f7c702031be15c3e2c5f8442cbb
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Tue Jul 11 23:52:13 2023 +0000

    Adding Pull Last dockers job

commit be1b1256e8b0e05308e8f7e7e69bee3acca55281
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Tue Jul 11 18:25:40 2023 -0500

    Update cmake.yml

commit 212299fa4355ae6ec18f9aaacbb79c51ea6c6f97
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Tue Jul 11 18:23:35 2023 -0500

    Update cmake.yml

commit 7c2c1327086a61466cc6cac39f70865c051a8bc7
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Tue Jul 11 18:18:53 2023 -0500

    Update cmake.yml

commit 191b5ce007e612e814c1d7a3afb4ad398f3852e1
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Tue Jul 11 16:03:22 2023 -0500

    Update cmake.yml

commit 8824113d95f3e13c7ce4d0af8e0d9d8f522a6c4a
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Tue Jul 11 16:28:09 2023 +0000

    Fixing Pull from Gerrit job name

    Change-Id: I9e7ed9a27a13ca49d62c93bdadb30f0057e4d385

commit cc3d5e4b02ffb439e8cc2b3efa53527c376f9982
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Tue Jul 11 16:21:43 2023 +0000

    Adding Staging sync job

    Change-Id: I0551f43878b0678ce4b3e74e27d62357cf95ad95

commit b9be2eee71380a2e6dd34d520e92d0c4209277a0
Author: Ammar ELWazir <Ammar.ELWazir@amd.com>
Date:   Tue Jul 11 15:57:11 2023 +0000

    Fixing build.sh

    Change-Id: Ia987b0244f0875370d5fe69907b3f5e9cea914de

commit 9eee33a95a1abd656a7ac5ca10a9f245e9825431
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 21:39:46 2023 -0500

    Update cmake.yml

commit 7093b85a78497140e8b52632ca2a002bdaeacd62
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 21:33:29 2023 -0500

    Update cmake.yml

commit f54697172c72a67740f9fdfa0c217b6ea6931576
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 21:01:26 2023 -0500

    Update cmake.yml

commit 1b6620e16f8940386b0f4f04e69e2410d21c0e26
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 20:21:02 2023 -0500

    Update cmake.yml

commit a94bec740c6b42c4b79c87bca20fa87b99bf060d
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 19:46:35 2023 -0500

    Update cmake.yml

commit 85d6b29d4375a69d575c18ece8542c50f2ddfcc3
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 19:34:39 2023 -0500

    Update cmake.yml

commit 8c004887cf1435f1a6214c3d2455299a8a27bd4c
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 19:31:17 2023 -0500

    Update cmake.yml

commit a14a9168e17d9348a53c6e9c9a47ba1edb4c4509
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 19:25:46 2023 -0500

    Update cmake.yml

commit 000f2f40b84e6a2f7d4becdbf5aed01436ca4c83
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 19:08:18 2023 -0500

    Update cmake.yml

commit a28a53d56731cad848fa9133d1c4dbaa8fc7afa7
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 19:03:39 2023 -0500

    Update cmake.yml

commit a6a2db01027f0b01fdfbb5997ddb772c7f51b649
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 18:21:53 2023 -0500

    Update cmake.yml

commit 118ef2a88b2d44e3207c31c343da3e5e5ec6f176
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 17:55:57 2023 -0500

    Update cmake.yml

commit 03c4c232396440cd0be6d2dd7baf4ceea1c2589d
Author: Ammar ELWazir <aelwazir@amd.com>
Date:   Mon Jul 10 17:48:49 2023 -0500

    Create cmake.yml

Change-Id: I77992f15694e77cbae49c56f9ff02f4f9079235d
2023-07-13 20:54:30 -04:00

1473 строки
50 KiB
C++

/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
///////////////////////////////////////////////////////////////////////////////
// //
// Test tool used as ROC profiler library demo //
// //
///////////////////////////////////////////////////////////////////////////////
#define ROCPROFILER_V1
#include <assert.h>
#include <cxxabi.h>
#include <dirent.h>
#include <hsa/hsa.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sys/types.h>
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <iostream>
#include <list>
#include <map>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include <algorithm>
#include "rocprofiler/rocprofiler.h"
#include "util/hsa_rsrc_factory.h"
#include "util/xml.h"
#define PUBLIC_API __attribute__((visibility("default")))
#define CONSTRUCTOR_API __attribute__((constructor))
#define DESTRUCTOR_API __attribute__((destructor))
#define KERNEL_NAME_LEN_MAX 128
#define ONLOAD_TRACE(str) \
if (getenv("ROCP_ONLOAD_TRACE")) do { \
std::cout << "PID(" << GetPid() << "): PROF_TOOL_LIB::" << __FUNCTION__ << " " << str \
<< std::endl \
<< std::flush; \
} while (0);
#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin")
#define ONLOAD_TRACE_END() ONLOAD_TRACE("end")
// Disoatch callback data type
struct callbacks_data_t {
rocprofiler_feature_t* features;
unsigned feature_count;
std::vector<uint32_t>* set;
unsigned group_index;
FILE* file_handle;
int filter_on;
std::vector<uint32_t>* gpu_index;
std::vector<std::string>* kernel_string;
std::vector<uint32_t>* range;
};
// kernel properties structure
struct kernel_properties_t {
uint32_t grid_size;
uint32_t workgroup_size;
uint32_t lds_size;
uint32_t scratch_size;
uint32_t arch_vgpr_count;
uint32_t accum_vgpr_count;
uint32_t sgpr_count;
uint32_t wave_size;
hsa_signal_t signal;
uint64_t object;
};
// Context stored entry type
struct context_entry_t {
bool valid;
bool active;
uint32_t index;
hsa_agent_t agent;
rocprofiler_group_t group;
rocprofiler_feature_t* features;
unsigned feature_count;
rocprofiler_callback_data_t data;
kernel_properties_t kernel_properties;
HsaRsrcFactory::symbols_map_it_t kernel_name_it;
FILE* file_handle;
};
//
const std::string rcfile_name = "rpl_rc.xml";
// verbose mode
static uint32_t verbose = 0;
// Enable tracing
static const bool trace_on = false;
// Tool is unloaded
volatile bool is_loaded = false;
// Dispatch callbacks and context handlers synchronization
pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
// Dispatch callback data
callbacks_data_t* callbacks_data = NULL;
// Stored contexts array
typedef std::map<uint32_t, context_entry_t> context_array_t;
context_array_t* context_array = NULL;
// Contexts collected count
volatile uint32_t context_count = 0;
volatile uint32_t context_collected = 0;
// Profiling results output dir
const char* result_prefix = NULL;
// Global results file handle
FILE* result_file_handle = NULL;
// True if a result file is opened
bool result_file_opened = false;
// Dispatch filters
// Metrics set
std::vector<uint32_t>* metrics_set = NULL;
// GPU index filter
std::vector<uint32_t>* gpu_index_vec = NULL;
// Kernel name filter
std::vector<std::string>* kernel_string_vec = NULL;
// DIspatch number range filter
std::vector<uint32_t>* range_vec = NULL;
// Otstanding dispatches parameters
static uint32_t CTX_OUTSTANDING_WAIT = 1;
static uint32_t CTX_OUTSTANDING_MAX = 0;
static uint32_t CTX_OUTSTANDING_MON = 0;
// to truncate kernel names
uint32_t to_truncate_names = 0;
// local trace buffer
bool is_trace_local = true;
static inline uint32_t GetPid() { return syscall(__NR_getpid); }
static inline uint32_t GetTid() { return syscall(__NR_gettid); }
uint32_t my_pid = GetPid();
// Error handler
void fatal(const std::string msg) {
fflush(stdout);
fprintf(stderr, "%s\n\n", msg.c_str());
fflush(stderr);
abort();
}
// Check returned HSA API status
void check_status(hsa_status_t status) {
if (status != HSA_STATUS_SUCCESS) {
const char* error_string = NULL;
rocprofiler_error_string(&error_string);
fprintf(stderr, "ERROR: %s\n", error_string);
abort();
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Profiling control thread /////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
uint32_t control_delay_us = 0;
uint32_t control_len_us = 0;
uint32_t control_dist_us = 0;
std::thread* trace_period_thread = nullptr;
std::atomic<bool> trace_period_stop{false};
std::atomic<bool> allow_profiling{false};
void trace_period_fun() {
std::this_thread::sleep_for(std::chrono::milliseconds(control_delay_us));
do {
allow_profiling = true;
if (trace_period_stop) {
allow_profiling = false;
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(control_len_us));
allow_profiling = false;
if (trace_period_stop) break;
std::this_thread::sleep_for(std::chrono::milliseconds(control_dist_us));
} while (!trace_period_stop);
}
//////////////////////////////////////////////////////////////////////////////////////
// Dispatch opt code /////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
// Context callback arg
struct callbacks_arg_t {
rocprofiler_pool_t** pools;
};
// Handler callback arg
struct handler_arg_t {
rocprofiler_feature_t* features;
unsigned feature_count;
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Print profiling results output break if terminal output is enabled
void results_output_break() {
const bool is_terminal_output = (result_file_opened == false);
if (is_terminal_output) printf("\nROCprofiler results:\n");
}
// Filtering kernel name
std::string filtr_kernel_name(const std::string name) {
auto rit = name.rbegin();
auto rend = name.rend();
uint32_t counter = 0;
char open_token = 0;
char close_token = 0;
while (rit != rend) {
if (counter == 0) {
switch (*rit) {
case ')':
counter = 1;
open_token = ')';
close_token = '(';
break;
case '>':
counter = 1;
open_token = '>';
close_token = '<';
break;
case ']':
counter = 1;
open_token = ']';
close_token = '[';
break;
case ' ':
++rit;
continue;
}
if (counter == 0) break;
} else {
if (*rit == open_token) counter++;
if (*rit == close_token) counter--;
}
++rit;
}
auto rbeg = rit;
while ((rit != rend) && (*rit != ' ') && (*rit != ':')) rit++;
const uint32_t pos = rend - rit;
const uint32_t length = rit - rbeg;
return name.substr(pos, length);
}
// Inflight submits monitoring thread
void* monitor_thr_fun(void*) {
while (context_array != NULL) {
sleep(CTX_OUTSTANDING_MON);
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
const uint32_t inflight = context_count - context_collected;
std::cerr << std::flush;
std::clog << std::flush;
std::cout << "ROCProfiler: count(" << context_count << "), outstanding(" << inflight << "/"
<< CTX_OUTSTANDING_MAX << ")" << std::endl
<< std::flush;
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
}
return NULL;
}
// Increment profiling context counter value
uint32_t next_context_count() {
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
++context_count;
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
return context_count;
}
// Allocate entry to store profiling context
context_entry_t* alloc_context_entry() {
if (CTX_OUTSTANDING_MAX != 0) {
while ((context_count - context_collected) > CTX_OUTSTANDING_MAX) usleep(1000);
}
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
const uint32_t index = next_context_count() - 1;
auto ret = context_array->insert({index, context_entry_t{}});
if (ret.second == false) {
fprintf(stderr, "context_array corruption, index repeated %u\n", index);
abort();
}
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
context_entry_t* entry = &(ret.first->second);
entry->index = index;
return entry;
}
// Allocate entry to store profiling context
void dealloc_context_entry(context_entry_t* entry) {
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
assert(context_array != NULL);
context_array->erase(entry->index);
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
}
// Global context map
static std::mutex ctx_a_mutex;
typedef std::map<hsa_agent_handle_t, context_entry_t*> ctx_a_map_t;
ctx_a_map_t* ctx_a_map = NULL;
context_entry_t* ck_ctx_entry(hsa_agent_t agent, bool& found) {
std::lock_guard<std::mutex> lock(ctx_a_mutex);
if (ctx_a_map == NULL) ctx_a_map = new ctx_a_map_t;
auto ret = ctx_a_map->insert({agent.handle, NULL});
found = !ret.second;
if (found)
ctx_a_map->erase(agent.handle);
else
ret.first->second = new context_entry_t{};
return ret.first->second;
}
struct trace_data_arg_t {
FILE* file;
const char* label;
hsa_agent_t agent;
};
// Align to specified alignment
unsigned align_size(unsigned size, unsigned alignment) {
return ((size + alignment - 1) & ~(alignment - 1));
}
// Output profiling results for input features
void output_results(const context_entry_t* entry, const char* label) {
FILE* file = entry->file_handle;
const rocprofiler_feature_t* features = entry->features;
const unsigned feature_count = entry->feature_count;
for (unsigned i = 0; i < feature_count; ++i) {
const rocprofiler_feature_t* p = &features[i];
fprintf(file, " %s ", p->name);
switch (p->data.kind) {
// Output metrics results
case ROCPROFILER_DATA_KIND_INT64:
fprintf(file, "(%lu)\n", p->data.result_int64);
break;
case ROCPROFILER_DATA_KIND_DOUBLE:
fprintf(file, "(%.10lf)\n", p->data.result_double);
break;
default:
fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind);
abort();
}
}
}
// Output group intermediate profiling results, created internally for complex metrics
void output_group(const context_entry_t* entry, const char* label) {
const rocprofiler_group_t* group = &(entry->group);
context_entry_t group_entry = *entry;
for (unsigned i = 0; i < group->feature_count; ++i) {
if (group->features[i]->data.kind == ROCPROFILER_DATA_KIND_INT64 ||
group->features[i]->data.kind == ROCPROFILER_DATA_KIND_DOUBLE) {
group_entry.features = group->features[i];
group_entry.feature_count = 1;
output_results(&group_entry, label);
}
}
}
// Dump stored context entry
bool dump_context_entry(context_entry_t* entry, bool to_clean = true) {
hsa_status_t status = HSA_STATUS_ERROR;
volatile std::atomic<bool>* valid = reinterpret_cast<std::atomic<bool>*>(&entry->valid);
while (valid->load() == false) sched_yield();
const rocprofiler_dispatch_record_t* record = entry->data.record;
if (record) {
if (record->complete == 0) {
return false;
}
}
++context_collected;
const uint32_t index = entry->index;
if (index != UINT32_MAX) {
FILE* file_handle = entry->file_handle;
const std::string nik_name = (to_truncate_names == 0)
? entry->data.kernel_name
: filtr_kernel_name(entry->data.kernel_name);
const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent);
fprintf(file_handle,
"dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), "
"wgr(%u), lds(%u), scr(%u), arch_vgpr(%u), accum_vgpr(%u), sgpr(%u), wave_size(%u), "
"sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")",
index, agent_info->dev_index, entry->data.queue_id, entry->data.queue_index, my_pid,
entry->data.thread_id, entry->kernel_properties.grid_size,
entry->kernel_properties.workgroup_size,
(entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) &
~(AgentInfo::lds_block_size - 1),
entry->kernel_properties.scratch_size, entry->kernel_properties.arch_vgpr_count,
entry->kernel_properties.accum_vgpr_count, entry->kernel_properties.sgpr_count,
entry->kernel_properties.wave_size, entry->kernel_properties.signal.handle,
entry->kernel_properties.object, nik_name.c_str());
if (record)
fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, record->begin, record->end,
record->complete);
fprintf(file_handle, "\n");
fflush(file_handle);
}
if (record && to_clean) {
delete record;
entry->data.record = NULL;
}
rocprofiler_group_t& group = entry->group;
if (group.context != NULL) {
if (entry->feature_count > 0) {
status = rocprofiler_group_get_data(&group);
check_status(status);
if (verbose == 1) output_group(entry, "group0-data");
status = rocprofiler_get_metrics(group.context);
check_status(status);
}
std::ostringstream oss;
oss << index << "__" << filtr_kernel_name(entry->data.kernel_name);
output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str());
if (to_clean) free(const_cast<char*>(entry->data.kernel_name));
// Finishing cleanup
// Deleting profiling context will delete all allocated resources
if (to_clean) rocprofiler_close(group.context);
}
return true;
}
// Wait for and dump all stored contexts for a given queue if not NULL
void dump_context_array(hsa_queue_t* queue) {
bool done = false;
while (done == false) {
done = true;
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
if (context_array) {
auto it = context_array->begin();
auto end = context_array->end();
while (it != end) {
auto cur = it++;
context_entry_t* entry = &(cur->second);
volatile std::atomic<bool>* valid = reinterpret_cast<std::atomic<bool>*>(&entry->valid);
while (valid->load() == false) sched_yield();
if ((queue == NULL) || (entry->data.queue == queue)) {
if (entry->active == true) {
if (dump_context_entry(&(cur->second)) == false)
done = false;
else
entry->active = false;
}
}
}
}
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
if (done == false) sched_yield();
}
}
// Profiling completion handler
// Dump and delete the context entry
bool context_handler(rocprofiler_group_t group, void* arg) {
context_entry_t* entry = reinterpret_cast<context_entry_t*>(arg);
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
bool ret = true;
if (entry->active == true) {
ret = dump_context_entry(entry);
if (ret == false) {
fprintf(stderr, "tool error: context is not complete\n");
abort();
}
}
if (ret) dealloc_context_entry(entry);
if (trace_on) {
fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()),
GetTid());
fflush(stdout);
}
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
return false;
}
// Profiling completion handler
// Dump context entry
bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) {
// Context entry
context_entry_t* ctx_entry = reinterpret_cast<context_entry_t*>(entry->payload);
handler_arg_t* handler_arg = reinterpret_cast<handler_arg_t*>(arg);
ctx_entry->features = handler_arg->features;
ctx_entry->feature_count = handler_arg->feature_count;
ctx_entry->data.kernel_name = ctx_entry->kernel_name_it->second.name;
ctx_entry->file_handle = result_file_handle;
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
dump_context_entry(ctx_entry, false);
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
HsaRsrcFactory::ReleaseKernelNameRef(ctx_entry->kernel_name_it);
return false;
}
// Profiling completion handler for concurrent implementation
// Dump the context entry
// Return true if the context was dumped successfully
bool context_handler_con(rocprofiler_group_t group, void* arg) {
context_entry_t* entry = reinterpret_cast<context_entry_t*>(arg);
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
bool ret = true;
ret = dump_context_entry(entry);
if (ret == false) {
fprintf(stderr, "tool error: context is not complete\n");
abort();
}
if (trace_on) {
fprintf(stdout, "tool::handler_con: context_map %d tid %u\n", (int)(ctx_a_map->size()),
GetTid());
fflush(stdout);
}
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
return false;
}
bool check_filter(const rocprofiler_callback_data_t* callback_data,
const callbacks_data_t* tool_data) {
bool found = true;
std::vector<uint32_t>* range_ptr = tool_data->range;
if (found && range_ptr) {
found = false;
std::vector<uint32_t>& range = *range_ptr;
if (range.size() == 1) {
if (context_count >= range[0]) found = true;
} else if (range.size() == 2) {
if ((context_count >= range[0]) && (context_count < range[1])) found = true;
}
}
std::vector<uint32_t>* gpu_index = tool_data->gpu_index;
if (found && gpu_index) {
found = false;
for (uint32_t i : *gpu_index) {
if (i == callback_data->agent_index) {
found = true;
}
}
}
std::vector<std::string>* kernel_string = tool_data->kernel_string;
if (found && kernel_string) {
found = false;
for (const std::string& s : *kernel_string) {
if (std::string(callback_data->kernel_name).find(s) != std::string::npos) {
found = true;
}
}
}
return found;
}
struct kernel_descriptor_t {
uint8_t reserved0[16];
int64_t kernel_code_entry_byte_offset;
uint8_t reserved1[20];
uint32_t compute_pgm_rsrc3;
uint32_t compute_pgm_rsrc1;
uint32_t compute_pgm_rsrc2;
uint16_t kernel_code_properties;
uint8_t reserved2[6];
};
// AMD Compute Program Resource Register Three.
typedef uint32_t amd_compute_pgm_rsrc_three32_t;
enum amd_compute_gfx9_pgm_rsrc_three_t {
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1)
};
enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t {
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_SHARED_VGPR_COUNT, 0, 4),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_INST_PREF_SIZE, 4, 6),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_START, 10, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_END, 11, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_IMAGE_OP, 31, 1)
};
// Kernel code properties.
enum amd_kernel_code_property_t {
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0,
1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, 4, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, 10,
1), // GFX10+
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4),
};
static const kernel_descriptor_t* GetKernelCode(uint64_t kernel_object) {
const kernel_descriptor_t* kernel_code = NULL;
hsa_status_t status =
HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
reinterpret_cast<const void*>(kernel_object),
reinterpret_cast<const void**>(&kernel_code));
if (HSA_STATUS_SUCCESS != status) {
kernel_code = reinterpret_cast<kernel_descriptor_t*>(kernel_object);
}
return kernel_code;
}
static uint32_t arch_vgpr_count(const AgentInfo& info, const kernel_descriptor_t& kernel_code) {
if (strcmp(info.name, "gfx90a") == 0 || strncmp(info.name, "gfx94", 5) == 0)
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3,
AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) +
1) *
4;
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
1) *
(AMD_HSA_BITS_GET(kernel_code.kernel_code_properties,
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
? 8
: 4);
}
static uint32_t accum_vgpr_count(const AgentInfo& info, const kernel_descriptor_t& kernel_code) {
if (strcmp(info.name, "gfx908") == 0) return arch_vgpr_count(info, kernel_code);
if (strcmp(info.name, "gfx90a") == 0 || strncmp(info.name, "gfx94", 5) == 0)
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
1) *
8 -
arch_vgpr_count(info, kernel_code);
return 0;
}
static uint32_t sgpr_count(const AgentInfo& info, const kernel_descriptor_t& kernel_code) {
// GFX10 and later always allocate 128 sgprs.
if (std::atoi(&info.gfxip[3]) >= 10) return 128;
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) /
2 +
1) *
16;
}
// Setting kernel properties
void set_kernel_properties(const rocprofiler_callback_data_t* callback_data,
context_entry_t* entry) {
const hsa_kernel_dispatch_packet_t* packet = callback_data->packet;
kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties);
const kernel_descriptor_t* kernel_code = (kernel_descriptor_t*)callback_data->kernel_code;
entry->data = *callback_data;
if (kernel_code == NULL) {
const uint64_t kernel_object = callback_data->packet->kernel_object;
kernel_code = GetKernelCode(kernel_object);
entry->kernel_name_it = HsaRsrcFactory::AcquireKernelNameRef(kernel_object);
} else {
entry->data.kernel_name = strdup(callback_data->kernel_name);
}
uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z;
if (grid_size > UINT32_MAX) abort();
kernel_properties_ptr->grid_size = (uint32_t)grid_size;
uint64_t workgroup_size =
packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z;
if (workgroup_size > UINT32_MAX) abort();
kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size;
kernel_properties_ptr->lds_size = packet->group_segment_size;
kernel_properties_ptr->scratch_size = packet->private_segment_size;
const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(callback_data->agent);
assert(agent_info != nullptr);
kernel_properties_ptr->arch_vgpr_count = arch_vgpr_count(*agent_info, *kernel_code);
kernel_properties_ptr->accum_vgpr_count = accum_vgpr_count(*agent_info, *kernel_code);
kernel_properties_ptr->sgpr_count = sgpr_count(*agent_info, *kernel_code);
kernel_properties_ptr->wave_size =
AMD_HSA_BITS_GET(kernel_code->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
? 32
: 64;
kernel_properties_ptr->signal = callback_data->completion_signal;
kernel_properties_ptr->object = callback_data->packet->kernel_object;
}
// Kernel disoatch callback
hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data,
rocprofiler_group_t* group) {
if (!allow_profiling) return HSA_STATUS_SUCCESS;
// TODO: return success, make atomic flag
// Passed tool data
callbacks_data_t* tool_data = reinterpret_cast<callbacks_data_t*>(user_data);
// HSA status
hsa_status_t status = HSA_STATUS_ERROR;
// Checking dispatch condition
if (tool_data->filter_on == 1) {
if (check_filter(callback_data, tool_data) == false) {
next_context_count();
return HSA_STATUS_SUCCESS;
}
}
// Profiling context
// Context entry
context_entry_t* entry = alloc_context_entry();
// Setting kernel properties
set_kernel_properties(callback_data, entry);
// context properties
rocprofiler_properties_t properties{};
properties.handler = (result_prefix != NULL) ? context_handler : NULL;
properties.handler_arg = (void*)entry;
rocprofiler_feature_t* features = tool_data->features;
unsigned feature_count = tool_data->feature_count;
if (tool_data->set != NULL) {
uint32_t set_offset = 0;
uint32_t next_offset = 0;
const auto entry_index = entry->index;
if (entry_index < (tool_data->set->size() - 1)) {
set_offset = (*(tool_data->set))[entry_index];
next_offset = (*(tool_data->set))[entry_index + 1];
} else {
set_offset = tool_data->set->back();
next_offset = feature_count;
}
features += set_offset;
feature_count = next_offset - set_offset;
}
// Open profiling context
rocprofiler_t* context = NULL;
status = rocprofiler_open(callback_data->agent, features, feature_count, &context,
0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties);
check_status(status);
// Check that we have only one profiling group
uint32_t group_count = 0;
status = rocprofiler_group_count(context, &group_count);
check_status(status);
assert(group_count == 1);
// Get group[0]
const uint32_t group_index = 0;
status = rocprofiler_get_group(context, group_index, group);
check_status(status);
// Fill profiling context entry
entry->agent = callback_data->agent;
entry->group = *group;
entry->features = features;
entry->feature_count = feature_count;
entry->file_handle = tool_data->file_handle;
entry->active = true;
reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);
if (trace_on) {
fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()),
GetTid());
fflush(stdout);
}
return status;
}
// Kernel disoatch callback
hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data,
void* user_data, rocprofiler_group_t* group) {
hsa_status_t status = HSA_STATUS_ERROR;
hsa_agent_t agent = callback_data->agent;
const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index;
callbacks_arg_t* callbacks_arg = reinterpret_cast<callbacks_arg_t*>(user_data);
rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id];
rocprofiler_pool_entry_t pool_entry{};
status = rocprofiler_pool_fetch(pool, &pool_entry);
check_status(status);
// Profiling context entry
rocprofiler_t* context = pool_entry.context;
context_entry_t* entry = reinterpret_cast<context_entry_t*>(pool_entry.payload);
// Setting kernel properties
set_kernel_properties(callback_data, entry);
// Get group[0]
status = rocprofiler_get_group(context, 0, group);
check_status(status);
// Fill profiling context entry
entry->index = UINT32_MAX;
entry->agent = agent;
entry->group = *group;
reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);
return status;
}
hsa_status_t destroy_callback(hsa_queue_t* queue, void*) {
results_output_break();
dump_context_array(queue);
return HSA_STATUS_SUCCESS;
}
static hsa_status_t info_callback(const rocprofiler_info_data_t info, void* arg) {
const char symb = *reinterpret_cast<const char*>(arg);
if (((symb == 'b') && (info.metric.expr == NULL)) ||
((symb == 'd') && (info.metric.expr != NULL))) {
if (info.metric.expr != NULL) {
fprintf(stdout, "\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name,
info.metric.description);
fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr);
} else {
fprintf(stdout, "\n gpu-agent%d : %s", info.agent_index, info.metric.name);
if (info.metric.instances > 1) fprintf(stdout, "[0-%u]", info.metric.instances - 1);
fprintf(stdout, " : %s\n", info.metric.description);
fprintf(stdout, " block %s has %u counters\n", info.metric.block_name,
info.metric.block_counters);
}
fflush(stdout);
}
return HSA_STATUS_SUCCESS;
}
std::string normalize_token(const std::string& token, bool not_empty, const std::string& label) {
const std::string space_chars_set = " \t";
const size_t first_pos = token.find_first_not_of(space_chars_set);
size_t norm_len = 0;
std::string error_str = "none";
if (first_pos != std::string::npos) {
const size_t last_pos = token.find_last_not_of(space_chars_set);
if (last_pos == std::string::npos)
error_str = "token string error: \"" + token + "\"";
else {
const size_t end_pos = last_pos + 1;
if (end_pos <= first_pos)
error_str = "token string error: \"" + token + "\"";
else
norm_len = end_pos - first_pos;
}
}
if (((first_pos != std::string::npos) && (norm_len == 0)) ||
((first_pos == std::string::npos) && not_empty)) {
fatal("normalize_token error, " + label + ": '" + token + "'," + error_str);
}
return (norm_len != 0) ? token.substr(first_pos, norm_len) : std::string("");
}
int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim,
std::vector<std::string>* vec, const char* label = NULL) {
int parse_iter = 0;
const auto& opts = node->opts;
auto it = opts.find(field);
if (it != opts.end()) {
const std::string array_string = it->second;
if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str());
size_t pos1 = 0;
const size_t string_len = array_string.length();
while (pos1 < string_len) {
const size_t pos2 = array_string.find(delim, pos1);
const bool found = (pos2 != std::string::npos);
const size_t token_len = (pos2 != std::string::npos) ? pos2 - pos1 : string_len - pos1;
const std::string token = array_string.substr(pos1, token_len);
const std::string norm_str = normalize_token(token, found, "get_xml_array");
if (norm_str.length() != 0) vec->push_back(norm_str);
if (!found) break;
pos1 = pos2 + 1;
++parse_iter;
}
}
return parse_iter;
}
int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field,
const std::string& delim, std::vector<std::string>* vec,
const char* label = NULL) {
int parse_iter = 0;
const auto nodes = xml->GetNodes(tag);
auto rit = nodes.rbegin();
const auto rend = nodes.rend();
while (rit != rend) {
auto& opts = (*rit)->opts;
if (opts.find(field) != opts.end()) break;
++rit;
}
if (rit != rend) {
parse_iter = get_xml_array(*rit, field, delim, vec, label);
// fatal("Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field);
}
return parse_iter;
}
int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field,
const std::string& delim, std::vector<uint32_t>* vec, const char* label = NULL) {
std::vector<std::string> str_vec;
const int parse_iter = get_xml_array(xml, tag, field, delim, &str_vec, label);
for (const std::string& str : str_vec) vec->push_back(atoi(str.c_str()));
return parse_iter;
}
static inline void check_env_var(const char* var_name, uint32_t& val) {
const char* str = getenv(var_name);
if (str != NULL) val = atol(str);
}
static inline void check_env_var(const char* var_name, uint64_t& val) {
const char* str = getenv(var_name);
if (str != NULL) val = atoll(str);
}
// HSA intercepting routines
// HSA unified callback function
hsa_status_t hsa_unified_callback(rocprofiler_hsa_cb_id_t id,
const rocprofiler_hsa_callback_data_t* data, void* arg) {
printf("hsa_unified_callback(%d, %p, %p):\n", (int)id, data, arg);
if (data == NULL) abort();
switch (id) {
case ROCPROFILER_HSA_CB_ID_ALLOCATE:
printf(" alloc ptr = %p\n", data->allocate.ptr);
printf(" alloc size = %zu\n", data->allocate.size);
printf(" segment type = 0x%x\n", data->allocate.segment);
printf(" global flag = 0x%x\n", data->allocate.global_flag);
printf(" is_code = %x\n", data->allocate.is_code);
break;
case ROCPROFILER_HSA_CB_ID_DEVICE:
printf(" device type = 0x%x\n", data->device.type);
printf(" device id = %u\n", data->device.id);
printf(" device agent = 0x%lx\n", data->device.agent.handle);
printf(" assigned ptr = %p\n", data->device.ptr);
break;
case ROCPROFILER_HSA_CB_ID_MEMCOPY:
printf(" memcopy dst = %p\n", data->memcopy.dst);
printf(" memcopy src = %p\n", data->memcopy.src);
printf(" memcopy size = %zu\n", data->memcopy.size);
break;
case ROCPROFILER_HSA_CB_ID_SUBMIT:
printf(" packet %p\n", data->submit.packet);
if (data->submit.kernel_name != NULL) {
printf(" submit kernel \"%s\"\n", data->submit.kernel_name);
printf(" device type = %u\n", data->submit.device_type);
printf(" device id = %u\n", data->submit.device_id);
}
break;
default:
printf("Unknown callback id(%u)\n", id);
abort();
}
fflush(stdout);
return HSA_STATUS_SUCCESS;
}
// HSA callbacks structure
rocprofiler_hsa_callbacks_t hsa_callbacks{hsa_unified_callback,
hsa_unified_callback,
hsa_unified_callback,
hsa_unified_callback,
NULL,
NULL};
// HSA kernel symbol callback
hsa_status_t hsa_ksymbol_cb(rocprofiler_hsa_cb_id_t id, const rocprofiler_hsa_callback_data_t* data,
void* arg) {
HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.unload);
return HSA_STATUS_SUCCESS;
}
// Tool constructor
extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) {
ONLOAD_TRACE_BEG();
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
if (is_loaded) return;
is_loaded = true;
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
// Loading configuration rcfile
std::string rcpath = std::string("./") + rcfile_name;
xml::Xml* rcfile = xml::Xml::Create(rcpath);
const char* home_dir = getenv("HOME");
if (rcfile == NULL && home_dir != NULL) {
rcpath = std::string(home_dir) + "/" + rcfile_name;
rcfile = xml::Xml::Create(rcpath);
}
const char* pkg_dir = getenv("ROCP_PACKAGE_DIR");
if (rcfile == NULL && pkg_dir != NULL) {
rcpath = std::string(pkg_dir) + "/" + rcfile_name;
rcfile = xml::Xml::Create(rcpath);
}
if (rcfile != NULL) {
// Getting defaults
printf("ROCProfiler pid(%u): rc-file '%s'\n", GetPid(), rcpath.c_str());
auto defaults_list = rcfile->GetNodes("top.defaults");
for (auto* entry : defaults_list) {
const auto& opts = entry->opts;
auto it = opts.find("basenames");
if (it != opts.end()) {
to_truncate_names = (it->second == "on") ? 1 : 0;
}
it = opts.find("timestamp");
if (it != opts.end()) {
settings->timestamp_on = (it->second == "on") ? 1 : 0;
}
it = opts.find("ctx-wait");
if (it != opts.end()) {
CTX_OUTSTANDING_WAIT = atol(it->second.c_str());
}
it = opts.find("ctx-limit");
if (it != opts.end()) {
CTX_OUTSTANDING_MAX = atol(it->second.c_str());
}
it = opts.find("heartbeat");
if (it != opts.end()) {
CTX_OUTSTANDING_MON = atol(it->second.c_str());
}
it = opts.find("trace-size");
if (it != opts.end()) {
std::string str = normalize_token(it->second, true, "option trace-size");
uint32_t multiplier = 1;
switch (str.back()) {
case 'K':
multiplier = 1024;
break;
case 'M':
multiplier = 1024 * 1024;
break;
}
if (multiplier != 1) str = str.substr(0, str.length() - 1);
settings->trace_size = strtoull(str.c_str(), NULL, 0) * multiplier;
}
it = opts.find("trace-local");
if (it != opts.end()) {
settings->trace_local = (it->second == "on");
}
it = opts.find("obj-tracking");
if (it != opts.end()) {
settings->code_obj_tracking = (it->second == "on");
}
it = opts.find("memcopies");
if (it != opts.end()) {
settings->memcopy_tracking = (it->second == "on");
}
}
}
// Enable verbose mode
check_env_var("ROCP_VERBOSE_MODE", verbose);
// Enable kernel names truncating
check_env_var("ROCP_TRUNCATE_NAMES", to_truncate_names);
// Set outstanding dispatches parameter
check_env_var("ROCP_OUTSTANDING_WAIT", CTX_OUTSTANDING_WAIT);
check_env_var("ROCP_OUTSTANDING_MAX", CTX_OUTSTANDING_MAX);
check_env_var("ROCP_OUTSTANDING_MON", CTX_OUTSTANDING_MON);
// Enable timestamping
check_env_var("ROCP_TIMESTAMP_ON", settings->timestamp_on);
// Set data timeout
check_env_var("ROCP_DATA_TIMEOUT", settings->timeout);
// Set trace size
check_env_var("ROCP_TRACE_SIZE", settings->trace_size);
// Set trace local buffer
check_env_var("ROCP_TRACE_LOCAL", settings->trace_local);
// Set code objects tracking
check_env_var("ROCP_OBJ_TRACKING", settings->code_obj_tracking);
// Set memcopies tracking
check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking);
// Set HSA intercepting
check_env_var("ROCP_HSA_INTERC", settings->hsa_intercepting);
if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14);
// Enable concurrent mode
check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent);
// Enable optmized mode
check_env_var("ROCP_OPT_MODE", settings->opt_mode);
is_trace_local = settings->trace_local;
// Printing out info
char* info_symb = getenv("ROCP_INFO");
if (info_symb != NULL) {
if (*info_symb != 'b' && *info_symb != 'd') {
fprintf(stderr, "ROCProfiler: bad info symbol '%c', ROCP_INFO env", *info_symb);
} else {
if (*info_symb == 'b')
printf("Basic HW counters:\n");
else
printf("Derived metrics:\n");
hsa_status_t status =
rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb);
check_status(status);
}
exit(1);
}
// Set output file
result_prefix = getenv("ROCP_OUTPUT_DIR");
if (result_prefix != NULL) {
DIR* dir = opendir(result_prefix);
if (dir == NULL) {
std::ostringstream errmsg;
errmsg << "ROCProfiler: Cannot open output directory '" << result_prefix << "'";
perror(errmsg.str().c_str());
abort();
}
std::ostringstream oss;
oss << result_prefix << "/" << GetPid() << "_results.txt";
result_file_handle = fopen(oss.str().c_str(), "w");
if (result_file_handle == NULL) {
std::ostringstream errmsg;
errmsg << "ROCProfiler: fopen error, file '" << oss.str().c_str() << "'";
perror(errmsg.str().c_str());
abort();
}
} else
result_file_handle = stdout;
result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL);
// Getting input
const char* xml_name = getenv("ROCP_INPUT");
if (xml_name == NULL) fatal("ROCProfiler: input is not specified, ROCP_INPUT env");
printf("ROCProfiler: input from \"%s\"\n", xml_name);
xml::Xml* xml = xml::Xml::Create(xml_name);
if (xml == NULL) {
fprintf(stderr, "ROCProfiler: Input file not found '%s'\n", xml_name);
abort();
}
// Getting metrics
std::vector<std::string> metrics_vec;
get_xml_array(xml, "top.metric", "name", ",", &metrics_vec);
// Metrics set
metrics_set = new std::vector<uint32_t>;
get_xml_array(xml, "top.metric", "set", ",", metrics_set, " ");
if (metrics_set->size() != 0) {
uint32_t accum = 0;
metrics_set->insert(metrics_set->begin(), 0);
for (auto it = metrics_set->begin(); it != metrics_set->end(); ++it) {
accum += *it;
*it = accum;
}
}
// Getting GPU indexes
gpu_index_vec = new std::vector<uint32_t>;
get_xml_array(xml, "top.metric", "gpu_index", ",", gpu_index_vec, " ");
// Skipping cpu count to get to correct gpu index
const uint32_t cpu_count = HsaRsrcFactory::Instance().GetCountOfCpuAgents();
std::transform(gpu_index_vec->begin(), gpu_index_vec->end(), gpu_index_vec->begin(),
[&](int count) { return count + cpu_count; });
// Getting kernel names
kernel_string_vec = new std::vector<std::string>;
get_xml_array(xml, "top.metric", "kernel", ",", kernel_string_vec, " ");
// Getting profiling range
range_vec = new std::vector<uint32_t>;
const int range_parse_iter = get_xml_array(xml, "top.metric", "range", ":", range_vec, " ");
if ((range_vec->size() > 2) || (range_parse_iter > 1)) {
fatal("Bad range format, input file " + xml->GetName());
}
if ((range_vec->size() == 1) && (range_parse_iter == 0)) {
range_vec->push_back(*(range_vec->begin()) + 1);
}
const bool filter_disabled =
(gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty());
// Getting traces
const auto traces_list = xml->GetNodes("top.trace");
if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time");
const unsigned feature_count = metrics_vec.size() + traces_list.size();
rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count];
memset(features, 0, feature_count * sizeof(rocprofiler_feature_t));
printf(" %d metrics\n", (int)metrics_vec.size());
for (unsigned i = 0; i < metrics_vec.size(); ++i) {
const std::string& name = metrics_vec[i];
printf("%s%s", (i == 0) ? " " : ", ", name.c_str());
features[i] = {};
features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC;
features[i].name = strdup(name.c_str());
}
if (metrics_vec.size()) printf("\n");
const uint32_t features_found = metrics_vec.size();
// Getting Trace Period
const char* ctrl_str = getenv("ROCP_CTRL_RATE");
if (ctrl_str != nullptr) {
uint32_t ctrl_delay = 0;
uint32_t ctrl_len = 0;
uint32_t ctrl_rate = 0;
if (sscanf(ctrl_str, "%d:%d:%d", &ctrl_delay, &ctrl_len, &ctrl_rate) != 3 ||
ctrl_len > ctrl_rate)
std::cerr << "Invalid ROCP_CTRL_RATE variable (ctrl_delay:ctrl_len:ctrl_rate)" << std::endl;
control_dist_us = ctrl_rate - ctrl_len;
control_len_us = ctrl_len;
control_delay_us = ctrl_delay;
if (ctrl_delay != UINT32_MAX) {
std::cout << "ROCProfiler: trace control: delay(" << ctrl_delay << "us), length(" << ctrl_len
<< "us), rate(" << ctrl_rate << "us)" << std::endl;
trace_period_thread = new std::thread(trace_period_fun);
} else {
std::cout << "ROCProfiler: trace start disabled" << std::endl;
}
} else {
allow_profiling = true;
}
// Context array aloocation
context_array = new context_array_t;
bool opt_mode_cond =
((features_found != 0) && (metrics_set->empty()) && (filter_disabled == true));
if (settings->opt_mode == 0) opt_mode_cond = false;
if (!opt_mode_cond) settings->opt_mode = 0;
if (opt_mode_cond) {
// Handler arg
handler_arg_t* handler_arg = new handler_arg_t{};
handler_arg->features = features;
handler_arg->feature_count = feature_count;
// Context properties
rocprofiler_pool_properties_t properties{};
properties.num_entries = (CTX_OUTSTANDING_MAX != 0) ? CTX_OUTSTANDING_MAX : 1000;
properties.payload_bytes = sizeof(context_entry_t);
properties.handler = context_pool_handler;
properties.handler_arg = handler_arg;
// Available GPU agents
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
callbacks_arg_t* callbacks_arg = new callbacks_arg_t{};
callbacks_arg->pools = new rocprofiler_pool_t*[gpu_count];
for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) {
// Getting GPU device info
const AgentInfo* agent_info = NULL;
if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false) {
fprintf(stderr, "GetGpuAgentInfo failed\n");
abort();
}
// Open profiling pool
rocprofiler_pool_t* pool = NULL;
hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, features_found,
&pool, 0, &properties);
check_status(status);
callbacks_arg->pools[gpu_id] = pool;
}
// Adding dispatch observer
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
callbacks_ptrs.dispatch = dispatch_callback_opt;
callbacks_ptrs.destroy = destroy_callback;
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg);
rocprofiler_hsa_callbacks_t cs{};
cs.ksymbol = hsa_ksymbol_cb;
rocprofiler_set_hsa_callbacks(cs, NULL);
settings->code_obj_tracking = 0;
settings->hsa_intercepting = 1;
} else {
// Adding dispatch observer
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
callbacks_ptrs.dispatch = dispatch_callback;
callbacks_ptrs.destroy = destroy_callback;
callbacks_data = new callbacks_data_t{};
callbacks_data->features = features;
callbacks_data->feature_count = features_found;
callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set;
callbacks_data->group_index = 0;
callbacks_data->file_handle = result_file_handle;
callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec;
callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec;
callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;
;
callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) ||
(callbacks_data->kernel_string != NULL) || (callbacks_data->range != NULL)
? 1
: 0;
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data);
}
xml::Xml::Destroy(xml);
if (CTX_OUTSTANDING_MON != 0) {
pthread_t thread;
pthread_attr_t attr;
int err = pthread_attr_init(&attr);
if (err) {
errno = err;
perror("pthread_attr_init");
abort();
}
err = pthread_create(&thread, &attr, monitor_thr_fun, NULL);
}
ONLOAD_TRACE_END();
}
// Tool destructor
void rocprofiler_unload(bool is_destr) {
ONLOAD_TRACE("begin loaded(" << is_loaded << ") destr(" << is_destr << ")");
if (pthread_mutex_lock(&mutex) != 0) {
perror("pthread_mutex_lock");
abort();
}
if (!is_loaded) return;
is_loaded = false;
if (pthread_mutex_unlock(&mutex) != 0) {
perror("pthread_mutex_unlock");
abort();
}
// Unregister dispatch callback
rocprofiler_remove_queue_callbacks();
// Dump stored profiling output data
fflush(stdout);
if (result_file_opened) {
printf("\nROCPRofiler:");
fflush(stdout);
if (CTX_OUTSTANDING_WAIT == 1) dump_context_array(NULL);
fclose(result_file_handle);
printf(" %u contexts collected, output directory %s\n", context_collected, result_prefix);
} else {
if (context_collected != context_count) {
results_output_break();
if (CTX_OUTSTANDING_WAIT == 1) dump_context_array(NULL);
}
printf("\nROCPRofiler: %u contexts collected\n", context_collected);
}
fflush(stdout);
// Cleanup
if (callbacks_data != NULL) {
delete[] callbacks_data->features;
delete callbacks_data;
callbacks_data = NULL;
}
delete metrics_set;
metrics_set = NULL;
delete gpu_index_vec;
gpu_index_vec = NULL;
delete kernel_string_vec;
kernel_string_vec = NULL;
delete range_vec;
range_vec = NULL;
delete context_array;
context_array = NULL;
ONLOAD_TRACE_END();
}
extern "C" PUBLIC_API void OnUnloadTool() {
ONLOAD_TRACE("begin loaded(" << is_loaded << ")");
if (is_loaded == true) rocprofiler_unload(false);
ONLOAD_TRACE_END();
}
extern "C" DESTRUCTOR_API void destructor() {
ONLOAD_TRACE("begin loaded(" << is_loaded << ")");
if (is_loaded == true) rocprofiler_unload(true);
ONLOAD_TRACE_END();
}