Public C++ header files and samples updates (#819)

* Public C++ header files (source/include/rocprofiler-sdk/cxx)

* Update samples/api_buffered_tracing

- scratch memory and page migration
- README

* Update samples/api_buffered_tracing

- page migration component in sample

* Update tests/page-migration/validate.py

- fix checks for page migration operation names

* Update tests/page-migration/validate.py

- fix get_allocated_pages

* Update scratch memory and page migration validations

* Fix include/rocprofiler-sdk/cxx installation

* Rework include/rocprofiler-sdk/cxx

- Improve name_info to support const char*, string_view, string

* Update samples/api_{buffered,callback}_tracing

* External correlation ID request sample

- includes correlation ID retirement demo

* Update samples/api_buffered_tracing/README.md

* Update lib/rocprofiler-sdk/hsa/queue.cpp

- generate correlation ID for kernel launch if one doesn't exist

* Remove priority check from tool libraries (samples/tests)

- if(priority > 0) return nullptr check in rocprofiler_configure has proliferated beyond its intended use

* Apply suggestions from code review

[ROCm/rocprofiler-sdk commit: de13d2ac5d]
Esse commit está contido em:
Jonathan R. Madsen
2024-04-25 20:09:11 -05:00
commit de GitHub
commit f167317524
37 arquivos alterados com 2232 adições e 512 exclusões
@@ -29,4 +29,5 @@ add_subdirectory(api_buffered_tracing)
add_subdirectory(code_object_tracing)
add_subdirectory(counter_collection)
add_subdirectory(intercept_table)
add_subdirectory(external_correlation_id_request)
# add_subdirectory(code_object_isa_decode) add_subdirectory(advanced_thread_trace)
@@ -543,9 +543,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "Adv_Thread_Trace_Sample";
@@ -559,8 +556,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -0,0 +1,18 @@
# API Buffer Tracing Sample
## Services
- Code object callback tracing for mapping kernel IDs to kernel names
- HSA API (Core, AMD Ext)
- HIP API (Runtime)
- Kernel dispatch
- Memory copy
- Page Migration
- Scratch Memory
## Properties
- Buffer size of 4096 bytes which is automatically flushed once >= 87.5% of buffer is filled (3584 bytes)
- Creation of dedicated thread for buffer callback delivery
- Push external correlation IDs once per thread (value is thread ID)
- Receives notifications for internal thread creation
@@ -50,6 +50,7 @@
#include <atomic>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
@@ -148,21 +149,29 @@ tool_tracing_callback(rocprofiler_context_id_t context,
{
auto* header = headers[i];
if(header == nullptr)
auto kind_name = std::string{};
if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING)
{
throw std::runtime_error{
"rocprofiler provided a null pointer to header. this should never happen"};
const char* _name = nullptr;
auto _kind = static_cast<rocprofiler_buffer_tracing_kind_t>(header->kind);
ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_name(_kind, &_name, nullptr),
"query buffer tracing kind name");
if(_name)
{
static size_t len = 15;
kind_name = std::string{_name};
len = std::max(len, kind_name.length());
kind_name.resize(len, ' ');
kind_name += " :: ";
}
}
else if(header->hash !=
rocprofiler_record_header_compute_hash(header->category, header->kind))
{
throw std::runtime_error{"rocprofiler_record_header_t (category | kind) != hash"};
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
(header->kind == ROCPROFILER_BUFFER_TRACING_HSA_CORE_API ||
header->kind == ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API ||
header->kind == ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API ||
header->kind == ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API))
if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
(header->kind == ROCPROFILER_BUFFER_TRACING_HSA_CORE_API ||
header->kind == ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API ||
header->kind == ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API ||
header->kind == ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API))
{
auto* record =
static_cast<rocprofiler_buffer_tracing_hsa_api_record_t*>(header->payload);
@@ -173,7 +182,7 @@ tool_tracing_callback(rocprofiler_context_id_t context,
<< ", extern_cid=" << record->correlation_id.external.value
<< ", kind=" << record->kind << ", operation=" << record->operation
<< ", start=" << record->start_timestamp << ", stop=" << record->end_timestamp
<< ", name=" << client_name_info.operation_names[record->kind][record->operation];
<< ", name=" << client_name_info.at(record->kind, record->operation);
if(record->start_timestamp > record->end_timestamp)
{
@@ -186,7 +195,7 @@ tool_tracing_callback(rocprofiler_context_id_t context,
}
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, info.str()});
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API)
@@ -200,7 +209,7 @@ tool_tracing_callback(rocprofiler_context_id_t context,
<< ", extern_cid=" << record->correlation_id.external.value
<< ", kind=" << record->kind << ", operation=" << record->operation
<< ", start=" << record->start_timestamp << ", stop=" << record->end_timestamp
<< ", name=" << client_name_info.operation_names[record->kind][record->operation];
<< ", name=" << client_name_info[record->kind][record->operation];
if(record->start_timestamp > record->end_timestamp)
{
@@ -213,7 +222,7 @@ tool_tracing_callback(rocprofiler_context_id_t context,
}
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, info.str()});
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH)
@@ -223,15 +232,16 @@ tool_tracing_callback(rocprofiler_context_id_t context,
auto info = std::stringstream{};
info << "agent_id=" << record->dispatch_info.agent_id.handle
info << "tid=" << record->thread_id << ", context=" << context.handle
<< ", buffer_id=" << buffer_id.handle
<< ", cid=" << record->correlation_id.internal
<< ", extern_cid=" << record->correlation_id.external.value
<< ", kind=" << record->kind << ", operation=" << record->operation
<< ", agent_id=" << record->dispatch_info.agent_id.handle
<< ", queue_id=" << record->dispatch_info.queue_id.handle
<< ", kernel_id=" << record->dispatch_info.kernel_id
<< ", kernel=" << client_kernels.at(record->dispatch_info.kernel_id).kernel_name
<< ", context=" << context.handle << ", buffer_id=" << buffer_id.handle
<< ", cid=" << record->correlation_id.internal
<< ", extern_cid=" << record->correlation_id.external.value
<< ", kind=" << record->kind << ", start=" << record->start_timestamp
<< ", stop=" << record->end_timestamp
<< ", start=" << record->start_timestamp << ", stop=" << record->end_timestamp
<< ", private_segment_size=" << record->dispatch_info.private_segment_size
<< ", group_segment_size=" << record->dispatch_info.group_segment_size
<< ", workgroup_size=(" << record->dispatch_info.workgroup_size.x << ","
@@ -244,7 +254,7 @@ tool_tracing_callback(rocprofiler_context_id_t context,
throw std::runtime_error("kernel dispatch: start > end");
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, info.str()});
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)
@@ -254,20 +264,111 @@ tool_tracing_callback(rocprofiler_context_id_t context,
auto info = std::stringstream{};
info << "src_agent_id=" << record->src_agent_id.handle
<< ", dst_agent_id=" << record->dst_agent_id.handle
<< ", direction=" << record->operation << ", context=" << context.handle
info << "tid=" << record->thread_id << ", context=" << context.handle
<< ", buffer_id=" << buffer_id.handle
<< ", cid=" << record->correlation_id.internal
<< ", extern_cid=" << record->correlation_id.external.value
<< ", kind=" << record->kind << ", start=" << record->start_timestamp
<< ", stop=" << record->end_timestamp;
<< ", kind=" << record->kind << ", operation=" << record->operation
<< ", src_agent_id=" << record->src_agent_id.handle
<< ", dst_agent_id=" << record->dst_agent_id.handle
<< ", direction=" << record->operation << ", start=" << record->start_timestamp
<< ", stop=" << record->end_timestamp
<< ", name=" << client_name_info.at(record->kind, record->operation);
if(record->start_timestamp > record->end_timestamp)
throw std::runtime_error("memory copy: start > end");
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, info.str()});
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_page_migration_record_t*>(header->payload);
auto info = std::stringstream{};
info << "kind=" << record->kind << ", operation=" << record->operation
<< ", pid=" << record->pid << ", start=" << record->start_timestamp
<< ", stop=" << record->end_timestamp
<< ", name=" << client_name_info.at(record->kind, record->operation);
switch(record->operation)
{
case ROCPROFILER_PAGE_MIGRATION_PAGE_MIGRATE:
{
info << ", page_fault=(" << record->page_fault.read_fault << ", "
<< record->page_fault.migrated << ", " << record->page_fault.node_id
<< ", " << std::hex << "0x" << record->page_fault.address << ")";
break;
}
case ROCPROFILER_PAGE_MIGRATION_PAGE_FAULT:
{
info << ", page_migrate=(" << std::hex << "0x"
<< record->page_migrate.start_addr << ", 0x"
<< record->page_migrate.end_addr << ", " << std::dec
<< record->page_migrate.from_node << ", " << record->page_migrate.to_node
<< ", " << record->page_migrate.prefetch_node << ", "
<< record->page_migrate.preferred_node << ", "
<< record->page_migrate.trigger << ")";
break;
}
case ROCPROFILER_PAGE_MIGRATION_QUEUE_SUSPEND:
{
info << ", queue_suspend=(" << record->queue_suspend.rescheduled << ", "
<< record->queue_suspend.node_id << ", " << record->queue_suspend.trigger
<< ")";
break;
}
case ROCPROFILER_PAGE_MIGRATION_UNMAP_FROM_GPU:
{
info << ", unmap_from_gpu=(" << record->unmap_from_gpu.node_id << std::hex
<< ", 0x" << record->unmap_from_gpu.start_addr << ", 0x"
<< record->unmap_from_gpu.end_addr << ", " << std::dec
<< record->unmap_from_gpu.trigger << ")";
break;
}
case ROCPROFILER_PAGE_MIGRATION_NONE:
case ROCPROFILER_PAGE_MIGRATION_LAST:
{
throw std::runtime_error{"unexpected page migration value"};
break;
}
}
if(record->start_timestamp > record->end_timestamp)
throw std::runtime_error("page migration: start > end");
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(header->payload);
auto info = std::stringstream{};
auto _elapsed =
std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(
std::chrono::nanoseconds{record->end_timestamp - record->start_timestamp})
.count();
info << "tid=" << record->thread_id << ", context=" << context.handle
<< ", buffer_id=" << buffer_id.handle
<< ", cid=" << record->correlation_id.internal
<< ", extern_cid=" << record->correlation_id.external.value
<< ", kind=" << record->kind << ", operation=" << record->operation
<< ", agent_id=" << record->agent_id.handle
<< ", queue_id=" << record->queue_id.handle << ", thread_id=" << record->thread_id
<< ", elapsed=" << std::setprecision(3) << std::fixed << _elapsed
<< " usec, flags=" << record->flags
<< ", name=" << client_name_info.at(record->kind, record->operation);
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else
{
@@ -312,25 +413,25 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
client_name_info = common::get_buffer_tracing_names();
for(const auto& itr : client_name_info.operation_names)
for(const auto& itr : client_name_info)
{
auto name_idx = std::stringstream{};
name_idx << " [" << std::setw(3) << static_cast<int32_t>(itr.first) << "]";
name_idx << " [" << std::setw(3) << itr.value << "]";
call_stack_v->emplace_back(
source_location{"rocprofiler_buffer_tracing_kind_names " + name_idx.str(),
__FILE__,
__LINE__,
client_name_info.kind_names.at(itr.first)});
std::string{itr.name}});
for(const auto& ditr : itr.second)
for(auto [didx, ditr] : itr.items())
{
auto operation_idx = std::stringstream{};
operation_idx << " [" << std::setw(3) << static_cast<int32_t>(ditr.first) << "]";
operation_idx << " [" << std::setw(3) << didx << "]";
call_stack_v->emplace_back(source_location{
"rocprofiler_buffer_tracing_kind_operation_names" + operation_idx.str(),
__FILE__,
__LINE__,
std::string{"- "} + std::string{ditr.second}});
std::string{"- "} + std::string{*ditr}});
}
}
@@ -338,28 +439,32 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation");
auto code_object_ops = std::vector<rocprofiler_tracing_operation_t>{
ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER};
ROCPROFILER_CALL(
rocprofiler_configure_callback_tracing_service(client_ctx,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
nullptr,
0,
code_object_ops.data(),
code_object_ops.size(),
tool_code_object_callback,
nullptr),
"code object tracing service configure");
constexpr auto buffer_size_bytes = 4096;
constexpr auto buffer_watermark_bytes = buffer_size_bytes - (buffer_size_bytes / 8);
ROCPROFILER_CALL(rocprofiler_create_buffer(client_ctx,
4096,
2048,
buffer_size_bytes,
buffer_watermark_bytes,
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
tool_tracing_callback,
tool_data,
&client_buffer),
"buffer creation");
for(auto itr : {ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API})
for(auto itr :
{ROCPROFILER_BUFFER_TRACING_HSA_CORE_API, ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API})
{
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
client_ctx, itr, nullptr, 0, client_buffer),
@@ -381,6 +486,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
client_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, client_buffer),
"buffer tracing service for memory copy configure");
// May have incompatible kernel so only emit a warning here
ROCPROFILER_WARN(rocprofiler_configure_buffer_tracing_service(
client_ctx, ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, nullptr, 0, client_buffer));
ROCPROFILER_CALL(
rocprofiler_configure_buffer_tracing_service(
client_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0, client_buffer),
"buffer tracing service for page migration configure");
auto client_thread = rocprofiler_callback_thread_t{};
ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread),
"creating callback thread");
@@ -469,9 +583,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "ExampleTool";
@@ -485,8 +596,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -30,6 +30,7 @@
#include <iostream>
#include <mutex>
#include <random>
#include <sstream>
#include <stdexcept>
#define HIP_API_CALL(CALL) \
@@ -51,7 +52,7 @@ namespace
{
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
size_t nthreads = 2;
size_t nthread_per_device = 2;
size_t nitr = 500;
size_t nsync = 10;
constexpr unsigned shared_mem_tile_dim = 32;
@@ -64,10 +65,19 @@ verify(int* in, int* out, int M, int N);
} // namespace
__global__ void
transpose_a(const int* in, int* out, int M, int N);
transpose(const int* in, int* out, int M, int N);
void
run(int rank, int tid, hipStream_t stream, int argc, char** argv);
run(int rank, int tid, int devid, int argc, char** argv);
void
run_transpose(int rank, int tid, hipStream_t stream, int argc, char** argv);
void
run_migrate(int rank, int tid, hipStream_t stream, int, char** argv);
void
run_scratch(int rank, int tid, hipStream_t stream, int argc, char** argv);
int
main(int argc, char** argv)
@@ -76,6 +86,8 @@ main(int argc, char** argv)
client::start(); // starts context before any API tables are available
client::identify(1);
auto* exe_name = ::basename(argv[0]);
int rank = 0;
for(int i = 1; i < argc; ++i)
{
@@ -83,47 +95,38 @@ main(int argc, char** argv)
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
fprintf(stderr,
"usage: transpose [NUM_THREADS (%zu)] [NUM_ITERATION (%zu)] "
"usage: %s [NUM_THREADS_PER_DEVICE (%zu)] [NUM_ITERATION (%zu)] "
"[SYNC_EVERY_N_ITERATIONS (%zu)]\n",
nthreads,
exe_name,
nthread_per_device,
nitr,
nsync);
exit(EXIT_SUCCESS);
}
}
if(argc > 1) nthreads = atoll(argv[1]);
if(argc > 1) nthread_per_device = atoll(argv[1]);
if(argc > 2) nitr = atoll(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
printf("[transpose] Number of threads: %zu\n", nthreads);
printf("[transpose] Number of iterations: %zu\n", nitr);
printf("[transpose] Syncing every %zu iterations\n", nsync);
// this is a temporary workaround in omnitrace when HIP + MPI is enabled
int ndevice = 0;
int devid = rank;
HIP_API_CALL(hipGetDeviceCount(&ndevice));
printf("[transpose] Number of devices found: %i\n", ndevice);
if(ndevice > 0)
auto nthreads = (ndevice * nthread_per_device);
printf("[%s] Number of devices found: %i\n", exe_name, ndevice);
printf("[%s] Number of threads (per device): %zu\n", exe_name, nthread_per_device);
printf("[%s] Number of threads (total): %zu\n", exe_name, nthreads);
printf("[%s] Number of iterations: %zu\n", exe_name, nitr);
printf("[%s] Syncing every %zu iterations\n", exe_name, nsync);
{
devid = rank % ndevice;
HIP_API_CALL(hipSetDevice(devid));
printf("[transpose] Rank %i assigned to device %i\n", rank, devid);
}
if(rank == devid && rank < ndevice)
{
std::vector<std::thread> _threads{};
std::vector<hipStream_t> _streams(nthreads);
auto _threads = std::vector<std::thread>{};
for(size_t i = 0; i < nthreads; ++i)
HIP_API_CALL(hipStreamCreate(&_streams.at(i)));
for(size_t i = 1; i < nthreads; ++i)
_threads.emplace_back(run, rank, i, _streams.at(i), argc, argv);
run(rank, 0, _streams.at(0), argc, argv);
_threads.emplace_back(run, rank, i, i % ndevice, argc, argv);
for(auto& itr : _threads)
itr.join();
for(size_t i = 0; i < nthreads; ++i)
HIP_API_CALL(hipStreamDestroy(_streams.at(i)));
}
HIP_API_CALL(hipDeviceSynchronize());
HIP_API_CALL(hipDeviceReset());
@@ -134,7 +137,7 @@ main(int argc, char** argv)
}
__global__ void
transpose_a(const int* in, int* out, int M, int N)
transpose(const int* in, int* out, int M, int N)
{
__shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim];
@@ -145,17 +148,91 @@ transpose_a(const int* in, int* out, int M, int N)
out[idx] = tile[threadIdx.x][threadIdx.y];
}
template <typename Tp>
__global__ void
test_page_migrate(Tp* data, Tp val)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
data[idx] += val;
}
__global__ void
test_kern_large(uint64_t* output)
{
uint64_t result = 0;
int test[4000];
memset(test, 5, 4000);
for(int& i : test)
{
i = i + 7;
*output += i;
result += i;
}
*output ^= result;
*output ^= result;
}
__global__ void
test_kern_medium(uint64_t* output)
{
uint64_t result = 0;
int test[175];
memset(test, 5, 175);
for(int& i : test)
{
i = i + 7;
*output += i;
result += i;
}
*output ^= result;
*output ^= result;
}
__global__ void
test_kern_small(uint64_t* output)
{
uint64_t result = 0;
int test[2];
for(int& i : test)
{
i = i + 7;
*output += i;
result += i;
}
*output ^= result;
*output ^= result;
}
void
run(int rank, int tid, hipStream_t stream, int argc, char** argv)
run(int rank, int tid, int devid, int argc, char** argv)
{
client::identify(tid + 1);
auto* stream = hipStream_t{};
HIP_API_CALL(hipSetDevice(devid));
HIP_API_CALL(hipStreamCreate(&stream));
run_migrate(rank, tid, stream, argc, argv);
run_scratch(rank, tid, stream, argc, argv);
run_transpose(rank, tid, stream, argc, argv);
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipStreamDestroy(stream));
}
void
run_transpose(int rank, int tid, hipStream_t stream, int argc, char** argv)
{
auto* exe_name = ::basename(argv[0]);
unsigned int M = 4960 * 2;
unsigned int N = 4960 * 2;
if(argc > 2) nitr = atoll(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{print_lock};
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
std::cout << "[" << exe_name << "][transpose][" << rank << "][" << tid << "] M: " << M
<< " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
@@ -180,10 +257,11 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
HIP_API_CALL(hipStreamSynchronize(stream));
dim3 grid(M / 32, N / 32, 1);
dim3 block(32, 32, 1); // transpose_a
dim3 block(32, 32, 1); // transpose
print_lock.lock();
printf("[transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
printf("[%s][transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
exe_name,
rank,
tid,
grid.x,
@@ -197,7 +275,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
auto t1 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < nitr; ++i)
{
transpose_a<<<grid, block, 0, stream>>>(in, out, M, N);
transpose<<<grid, block, 0, stream>>>(in, out, M, N);
check_hip_error();
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
}
@@ -208,9 +286,9 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
<< " sec\n";
std::cout << "[transpose][" << rank << "][" << tid
std::cout << "[" << exe_name << "][transpose][" << rank << "][" << tid
<< "] Runtime of transpose is " << time << " sec\n";
std::cout << "[" << exe_name << "][transpose][" << rank << "][" << tid
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
@@ -227,6 +305,92 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
delete[] out_matrix;
}
void
run_scratch(int rank, int tid, hipStream_t stream, int, char** argv)
{
auto t1 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
const auto* exe_name = ::basename(argv[0]);
uint64_t* data_ptr = nullptr;
HIP_API_CALL(hipHostMalloc(&data_ptr, sizeof(uint64_t), 0));
*data_ptr = 0;
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
test_kern_medium<<<1000, 1, 0, stream>>>(data_ptr);
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
test_kern_large<<<1100, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_medium<<<1000, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_large<<<1100, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
auto t2 = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
print_lock.lock();
std::cout << "[" << exe_name << "][scratch][" << rank << "][" << tid
<< "] Runtime of scratch is " << time << " sec\n";
print_lock.unlock();
}
void
run_migrate(int rank, int tid, hipStream_t stream, int, char** argv)
{
using data_type = uint64_t;
constexpr data_type init_v = 1;
constexpr data_type incr_v = 1;
auto t1 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
const auto* exe_name = ::basename(argv[0]);
auto page_data = std::vector<data_type>(1024, 0);
HIP_API_CALL(hipHostRegister(
page_data.data(), page_data.size() * sizeof(data_type), hipHostRegisterDefault));
for(auto& itr : page_data)
itr = init_v;
test_page_migrate<<<1, 1024, 0, stream>>>(page_data.data(), incr_v);
HIP_API_CALL(hipStreamSynchronize(stream));
for(auto& itr : page_data)
{
auto diff = (itr - incr_v);
if(diff != init_v)
{
auto msg = std::stringstream{};
msg << "invalid diff: " << diff << ". expected: " << init_v;
throw std::runtime_error{msg.str()};
}
}
HIP_API_CALL(hipHostUnregister(page_data.data()));
auto t2 = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
print_lock.lock();
std::cout << "[" << exe_name << "][migrate][" << rank << "][" << tid
<< "] Runtime of migrate is " << time << " sec\n";
print_lock.unlock();
}
namespace
{
void
@@ -191,25 +191,25 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
callback_name_info name_info = common::get_callback_id_names();
for(const auto& itr : name_info.operation_names)
for(const auto& itr : name_info)
{
auto name_idx = std::stringstream{};
name_idx << " [" << std::setw(3) << static_cast<int32_t>(itr.first) << "]";
name_idx << " [" << std::setw(3) << itr.value << "]";
call_stack_v->emplace_back(
source_location{"rocprofiler_callback_tracing_kind_names " + name_idx.str(),
__FILE__,
__LINE__,
name_info.kind_names.at(itr.first)});
std::string{itr.name}});
for(const auto& ditr : itr.second)
for(auto [didx, ditr] : itr.items())
{
auto operation_idx = std::stringstream{};
operation_idx << " [" << std::setw(3) << static_cast<int32_t>(ditr.first) << "]";
operation_idx << " [" << std::setw(3) << didx << "]";
call_stack_v->emplace_back(source_location{
"rocprofiler_callback_tracing_kind_operation_names" + operation_idx.str(),
__FILE__,
__LINE__,
std::string{"- "} + std::string{ditr.second}});
std::string{"- "} + std::string{*ditr}});
}
}
@@ -322,9 +322,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "ExampleTool";
@@ -338,8 +335,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -217,9 +217,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "ExampleTool";
@@ -233,8 +230,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -352,9 +352,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "ExampleTool";
@@ -368,8 +365,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -22,6 +22,31 @@
#pragma once
#define ROCPROFILER_WARN(result) \
{ \
rocprofiler_status_t CHECKSTATUS = result; \
if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \
{ \
std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \
std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] " << #result \
<< " returned error code " << CHECKSTATUS << ": " << status_msg \
<< ". This is just a warning!" << std::endl; \
} \
}
#define ROCPROFILER_CHECK(result) \
{ \
rocprofiler_status_t CHECKSTATUS = result; \
if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \
{ \
std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \
std::stringstream errmsg{}; \
errmsg << "[" << __FILE__ << ":" << __LINE__ << "] " << #result \
<< " failed with error code " << CHECKSTATUS << " :: " << status_msg; \
throw std::runtime_error(errmsg.str()); \
} \
}
#define ROCPROFILER_CALL(result, msg) \
{ \
rocprofiler_status_t CHECKSTATUS = result; \
@@ -24,6 +24,7 @@
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include "defines.hpp"
@@ -37,146 +38,18 @@
namespace common
{
using buffer_kind_names_t = std::map<rocprofiler_buffer_tracing_kind_t, const char*>;
using buffer_kind_operation_names_t =
std::map<rocprofiler_buffer_tracing_kind_t, std::map<uint32_t, const char*>>;
using callback_kind_names_t = std::map<rocprofiler_callback_tracing_kind_t, const char*>;
using callback_kind_operation_names_t =
std::map<rocprofiler_callback_tracing_kind_t, std::map<uint32_t, const char*>>;
using callback_name_info = rocprofiler::sdk::callback_name_info;
using buffer_name_info = rocprofiler::sdk::buffer_name_info;
struct buffer_name_info
{
buffer_kind_names_t kind_names = {};
buffer_kind_operation_names_t operation_names = {};
};
struct callback_name_info
{
callback_kind_names_t kind_names = {};
callback_kind_operation_names_t operation_names = {};
};
inline buffer_name_info
inline auto
get_buffer_tracing_names()
{
static const auto supported_kinds = std::unordered_set<rocprofiler_buffer_tracing_kind_t>{
ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API,
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API,
ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API,
ROCPROFILER_BUFFER_TRACING_MARKER_CONTROL_API,
ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
};
auto cb_name_info = buffer_name_info{};
//
// callback for each kind operation
//
static auto tracing_kind_operation_cb =
[](rocprofiler_buffer_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<buffer_name_info*>(data_v);
if(supported_kinds.count(kindv) > 0)
{
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
}
return 0;
};
//
// callback for each buffer kind (i.e. domain)
//
static auto tracing_kind_cb = [](rocprofiler_buffer_tracing_kind_t kind, void* data) {
// store the buffer kind name
auto* name_info_v = static_cast<buffer_name_info*>(data);
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_name(kind, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->kind_names[kind] = name;
if(supported_kinds.count(kind) > 0)
{
ROCPROFILER_CALL(rocprofiler_iterate_buffer_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data)),
"iterating buffer tracing kind operations");
}
return 0;
};
ROCPROFILER_CALL(rocprofiler_iterate_buffer_tracing_kinds(tracing_kind_cb,
static_cast<void*>(&cb_name_info)),
"iterating buffer tracing kinds");
return cb_name_info;
return rocprofiler::sdk::get_buffer_tracing_names();
}
inline callback_name_info
inline auto
get_callback_id_names()
{
static auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>{
ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
};
auto cb_name_info = callback_name_info{};
//
// callback for each kind operation
//
static auto tracing_kind_operation_cb =
[](rocprofiler_callback_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<callback_name_info*>(data_v);
if(supported.count(kindv) > 0)
{
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query callback tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
}
return 0;
};
//
// callback for each callback kind (i.e. domain)
//
static auto tracing_kind_cb = [](rocprofiler_callback_tracing_kind_t kind, void* data) {
// store the callback kind name
auto* name_info_v = static_cast<callback_name_info*>(data);
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_name(kind, &name, nullptr),
"query callback tracing kind operation name");
if(name) name_info_v->kind_names[kind] = name;
if(supported.count(kind) > 0)
{
ROCPROFILER_CALL(rocprofiler_iterate_callback_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data)),
"iterating callback tracing kind operations");
}
return 0;
};
ROCPROFILER_CALL(rocprofiler_iterate_callback_tracing_kinds(tracing_kind_cb,
static_cast<void*>(&cb_name_info)),
"iterating callback tracing kinds");
return cb_name_info;
return rocprofiler::sdk::get_callback_tracing_names();
}
} // namespace common
@@ -207,9 +207,9 @@ tool_fini(void* user_data)
} // namespace
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t,
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// set the client name
@@ -222,8 +222,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -286,9 +286,9 @@ tool_fini(void* user_data)
} // namespace
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t,
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// set the client name
@@ -301,8 +301,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -415,9 +415,9 @@ tool_fini(void*)
} // namespace
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t,
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// set the client name
@@ -430,8 +430,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -0,0 +1,59 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-sdk-samples-external-correlation-id-request LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
find_package(rocprofiler-sdk REQUIRED)
add_library(external-correlation-id-request-client SHARED)
target_sources(external-correlation-id-request-client PRIVATE client.cpp client.hpp)
target_link_libraries(
external-correlation-id-request-client
PRIVATE rocprofiler-sdk::rocprofiler-sdk rocprofiler::samples-build-flags
rocprofiler::samples-common-library)
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
find_package(Threads REQUIRED)
add_executable(external-correlation-id-request)
target_sources(external-correlation-id-request PRIVATE main.cpp)
target_link_libraries(
external-correlation-id-request
PRIVATE external-correlation-id-request-client Threads::Threads
rocprofiler::samples-build-flags)
rocprofiler_samples_get_preload_env(PRELOAD_ENV external-correlation-id-request-client)
rocprofiler_samples_get_ld_library_path_env(LIBRARY_PATH_ENV)
set(external-correlation-id-request-env ${PRELOAD_ENV} ${LIBRARY_PATH_ENV})
add_test(NAME external-correlation-id-request
COMMAND $<TARGET_FILE:external-correlation-id-request>)
set_tests_properties(
external-correlation-id-request
PROPERTIES TIMEOUT 45 LABELS "samples" ENVIRONMENT
"${external-correlation-id-request-env}" FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -0,0 +1,24 @@
# External Correlation ID Request Sample
## Services
- Code object callback tracing for mapping kernel IDs to kernel names
- HIP Runtime API:
- hipLaunchKernel
- hipMemcpyAsync
- hipMemsetAsync
- hipMalloc
- Kernel dispatch
- Memory Copy
- External correlation ID request:
- Kernel dispatch
- Memory copy
- Correlation ID retirement
## Properties
- Subscribes to an external correlation ID request for all kernel dispatches and async memory copies
- Generates an external correlation ID containing all the arguments passed to the request callback
- Demonstrates that all external correlation IDs which are requested are passed back to tool in buffer callbacks
- Demonstrates that all internal correlation IDs which are provided as an input argument to request are retired
- Buffer size of 4096 bytes which is automatically flushed once >= 87.5% of buffer is filled (3584 bytes)
@@ -0,0 +1,632 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// undefine NDEBUG so asserts are implemented
#ifdef NDEBUG
# undef NDEBUG
#endif
/**
* @file samples/api_buffered_tracing/client.cpp
*
* @brief Example rocprofiler client (tool)
*/
#include "client.hpp"
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/context.h>
#include <rocprofiler-sdk/external_correlation.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hip/runtime_api_id.h>
#include <rocprofiler-sdk/internal_threading.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "common/call_stack.hpp"
#include "common/defines.hpp"
#include "common/filesystem.hpp"
#include "common/name_info.hpp"
#include <atomic>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <thread>
#include <unordered_set>
#include <vector>
namespace client
{
namespace
{
struct external_corr_id_data;
using common::buffer_name_info;
using common::call_stack_t;
using common::source_location;
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
using kernel_symbol_map_t = std::unordered_map<rocprofiler_kernel_id_t, kernel_symbol_data_t>;
using external_corr_id_set_t = std::unordered_set<external_corr_id_data*>;
using retired_corr_id_set_t = std::unordered_set<uint64_t>;
rocprofiler_client_id_t* client_id = nullptr;
rocprofiler_client_finalize_t client_fini_func = nullptr;
rocprofiler_context_id_t client_ctx = {};
rocprofiler_buffer_id_t client_buffer = {};
buffer_name_info* client_name_info = new buffer_name_info{};
kernel_symbol_map_t* client_kernels = new kernel_symbol_map_t{};
auto client_mutex = std::shared_mutex{};
auto client_external_corr_ids = external_corr_id_set_t{};
auto client_retired_corr_ids = retired_corr_id_set_t{};
void
print_call_stack(const call_stack_t& _call_stack)
{
common::print_call_stack("external_correlation_id_request.log", _call_stack);
}
void
tool_code_object_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
void* callback_data)
{
if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
record.operation == ROCPROFILER_CODE_OBJECT_LOAD)
{
if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
{
// flush the buffer to ensure that any lookups for the client kernel names for the code
// object are completed
auto flush_status = rocprofiler_flush_buffer(client_buffer);
if(flush_status != ROCPROFILER_STATUS_ERROR_BUFFER_BUSY)
ROCPROFILER_CHECK(flush_status);
}
}
else if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
{
auto* data = static_cast<kernel_symbol_data_t*>(record.payload);
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
{
client_kernels->emplace(data->kernel_id, *data);
}
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
{
auto flush_status = rocprofiler_flush_buffer(client_buffer);
if(flush_status != ROCPROFILER_STATUS_ERROR_BUFFER_BUSY)
ROCPROFILER_CHECK(flush_status);
client_kernels->erase(data->kernel_id);
}
}
(void) user_data;
(void) callback_data;
}
struct external_corr_id_data
{
using request_kind_t = rocprofiler_external_correlation_id_request_kind_t;
static constexpr auto request_none = ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_NONE;
rocprofiler_thread_id_t thread_id = 0;
rocprofiler_context_id_t context_id = {.handle = 0};
request_kind_t kind = request_none;
rocprofiler_tracing_operation_t operation = 0;
uint64_t internal_corr_id = 0;
void* user_data = nullptr;
uint64_t seen_count = 0;
bool valid() const;
friend std::ostream& operator<<(std::ostream& os, external_corr_id_data data)
{
if(!data.valid()) return os;
auto ss = std::stringstream{};
ss << "seen=" << data.seen_count << ", thr_id=" << data.thread_id
<< ", context_id=" << data.context_id.handle << ", kind=" << data.kind
<< ", operation=" << data.operation << ", corr_id=" << data.internal_corr_id
<< ", user_data=" << data.user_data;
return (os << ss.str());
}
};
bool
operator==(external_corr_id_data lhs, external_corr_id_data rhs)
{
return std::tie(lhs.thread_id,
lhs.context_id.handle,
lhs.kind,
lhs.operation,
lhs.internal_corr_id,
lhs.user_data) == std::tie(rhs.thread_id,
rhs.context_id.handle,
rhs.kind,
rhs.operation,
rhs.internal_corr_id,
rhs.user_data);
}
bool
operator!=(external_corr_id_data lhs, external_corr_id_data rhs)
{
return !(lhs == rhs);
}
bool
external_corr_id_data::valid() const
{
static constexpr auto invalid_v = external_corr_id_data{};
return (*this != invalid_v);
}
int
set_external_correlation_id(rocprofiler_thread_id_t thr_id,
rocprofiler_context_id_t ctx_id,
rocprofiler_external_correlation_id_request_kind_t kind,
rocprofiler_tracing_operation_t op,
uint64_t internal_corr_id,
rocprofiler_user_data_t* external_corr_id,
void* user_data)
{
auto* _data =
new external_corr_id_data{thr_id, ctx_id, kind, op, internal_corr_id, user_data, 0};
{
static auto _mtx = std::mutex{};
auto _lk = std::unique_lock{_mtx};
client_external_corr_ids.emplace(_data);
}
external_corr_id->ptr = _data;
return 0;
}
void
tool_tracing_callback(rocprofiler_context_id_t context,
rocprofiler_buffer_id_t buffer_id,
rocprofiler_record_header_t** headers,
size_t num_headers,
void* user_data,
uint64_t /*drop_count*/)
{
static const auto ensure_internal_correlation_id_retirement_ordering = [](uint64_t _corr_id) {
auto _lk = std::shared_lock<std::shared_mutex>{client_mutex};
// this correlation ID should not have reported as retired yet so
// we are demoing the expectation here
if(client_retired_corr_ids.count(_corr_id) > 0)
{
auto msg = std::stringstream{};
msg << "internal correlation id " << _corr_id << " was retired prematurely";
throw std::runtime_error{msg.str()};
}
};
for(size_t i = 0; i < num_headers; ++i)
{
auto* header = headers[i];
auto kind_name = std::string{};
if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING)
{
const char* _name = nullptr;
auto _kind = static_cast<rocprofiler_buffer_tracing_kind_t>(header->kind);
ROCPROFILER_CHECK(rocprofiler_query_buffer_tracing_kind_name(_kind, &_name, nullptr));
if(_name)
{
static size_t len = 15;
kind_name = std::string{_name};
len = std::max(len, kind_name.length());
kind_name.resize(len, ' ');
kind_name += " :: ";
}
}
if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_hip_api_record_t*>(header->payload);
// this should always be empty
auto _extern_corr_id = external_corr_id_data{};
// demonstrate reliability of correlation ID retirement ordering
ensure_internal_correlation_id_retirement_ordering(record->correlation_id.internal);
auto info = std::stringstream{};
info << "tid=" << record->thread_id << ", context=" << context.handle
<< ", buffer_id=" << buffer_id.handle
<< ", corr_id=" << record->correlation_id.internal << ", kind=" << record->kind
<< ", operation=" << record->operation
<< ", name=" << (*client_name_info)[record->kind][record->operation]
<< ", extern_corr_id={" << _extern_corr_id << "}";
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_kernel_dispatch_record_t*>(header->payload);
// demonstrate reliability of correlation ID retirement ordering
ensure_internal_correlation_id_retirement_ordering(record->correlation_id.internal);
auto _extern_corr_id = external_corr_id_data{};
if(record->correlation_id.external.ptr)
{
auto* _extcid =
static_cast<external_corr_id_data*>(record->correlation_id.external.ptr);
_extcid->seen_count++;
_extern_corr_id = *_extcid;
// demonstrate reliability of correlation ID retirement ordering
ensure_internal_correlation_id_retirement_ordering(_extcid->internal_corr_id);
}
auto info = std::stringstream{};
info << "tid=" << record->thread_id << ", context=" << context.handle
<< ", buffer_id=" << buffer_id.handle
<< ", corr_id=" << record->correlation_id.internal << ", kind=" << record->kind
<< ", operation=" << record->operation
<< ", agent_id=" << record->dispatch_info.agent_id.handle
<< ", queue_id=" << record->dispatch_info.queue_id.handle
<< ", dispatch_id=" << record->dispatch_info.dispatch_id
<< ", kernel_id=" << record->dispatch_info.kernel_id
<< ", kernel=" << client_kernels->at(record->dispatch_info.kernel_id).kernel_name
<< ", extern_corr_id={" << _extern_corr_id << "}";
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_memory_copy_record_t*>(header->payload);
// demonstrate reliability of correlation ID retirement ordering
ensure_internal_correlation_id_retirement_ordering(record->correlation_id.internal);
auto _extern_corr_id = external_corr_id_data{};
if(record->correlation_id.external.ptr)
{
auto* _extcid =
static_cast<external_corr_id_data*>(record->correlation_id.external.ptr);
_extcid->seen_count++;
_extern_corr_id = *_extcid;
// demonstrate reliability of correlation ID retirement ordering
ensure_internal_correlation_id_retirement_ordering(_extcid->internal_corr_id);
}
auto info = std::stringstream{};
info << "tid=" << record->thread_id << ", context=" << context.handle
<< ", buffer_id=" << buffer_id.handle
<< ", corr_id=" << record->correlation_id.internal << ", kind=" << record->kind
<< ", operation=" << record->operation
<< ", src_agent_id=" << record->src_agent_id.handle
<< ", dst_agent_id=" << record->dst_agent_id.handle
<< ", direction=" << record->operation << ", start=" << record->start_timestamp
<< ", stop=" << record->end_timestamp
<< ", name=" << client_name_info->at(record->kind, record->operation)
<< ", extern_corr_id={" << _extern_corr_id << "}";
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING &&
header->kind == ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_correlation_id_retirement_record_t*>(
header->payload);
{
auto _lk = std::unique_lock<std::shared_mutex>{client_mutex};
client_retired_corr_ids.emplace(record->internal_correlation_id);
}
auto _extern_corr_id = external_corr_id_data{};
auto info = std::stringstream{};
info << "context=" << context.handle << ", buffer_id=" << buffer_id.handle
<< ", corr_id=" << record->internal_correlation_id << ", kind=" << record->kind
<< ", timestamp=" << record->timestamp
<< ", name=" << client_name_info->at(record->kind) << ", extern_corr_id={"
<< _extern_corr_id << "}";
static_cast<call_stack_t*>(user_data)->emplace_back(
source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()});
}
else
{
auto _msg = std::stringstream{};
_msg << "unexpected rocprofiler_record_header_t category + kind: (" << header->category
<< " + " << header->kind << ")";
throw std::runtime_error{_msg.str()};
}
}
}
template <typename Arg, typename... Args>
auto
make_array(Arg arg, Args&&... args)
{
constexpr auto N = 1 + sizeof...(Args);
return std::array<Arg, N>{std::forward<Arg>(arg), std::forward<Args>(args)...};
}
int
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
{
assert(tool_data != nullptr);
auto* call_stack_v = static_cast<call_stack_t*>(tool_data);
call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""});
*client_name_info = common::get_buffer_tracing_names();
client_fini_func = fini_func;
ROCPROFILER_CHECK(rocprofiler_create_context(&client_ctx));
auto code_object_ops = std::vector<rocprofiler_tracing_operation_t>{
ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER};
ROCPROFILER_CHECK(
rocprofiler_configure_callback_tracing_service(client_ctx,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
code_object_ops.data(),
code_object_ops.size(),
tool_code_object_callback,
nullptr));
constexpr auto buffer_size_bytes = 4096;
constexpr auto buffer_watermark_bytes = buffer_size_bytes - (buffer_size_bytes / 8);
ROCPROFILER_CHECK(rocprofiler_create_buffer(client_ctx,
buffer_size_bytes,
buffer_watermark_bytes,
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
tool_tracing_callback,
tool_data,
&client_buffer));
auto external_corr_id_request_kinds =
make_array(ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY);
ROCPROFILER_CHECK(rocprofiler_configure_external_correlation_id_request_service(
client_ctx,
external_corr_id_request_kinds.data(),
external_corr_id_request_kinds.size(),
set_external_correlation_id,
nullptr));
auto hip_runtime_ops = std::vector<rocprofiler_tracing_operation_t>{};
const auto desired_hip_runtime_ops = std::unordered_set<std::string_view>{
"hipLaunchKernel", "hipMemcpyAsync", "hipMemsetAsync", "hipMalloc"};
for(auto [idx, itr] : (*client_name_info)[ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API].items())
{
if(desired_hip_runtime_ops.count(*itr) > 0) hip_runtime_ops.emplace_back(idx);
}
if(desired_hip_runtime_ops.size() != hip_runtime_ops.size())
throw std::runtime_error{"missing hip operations"};
ROCPROFILER_CHECK(
rocprofiler_configure_buffer_tracing_service(client_ctx,
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API,
hip_runtime_ops.data(),
hip_runtime_ops.size(),
client_buffer));
ROCPROFILER_CHECK(rocprofiler_configure_buffer_tracing_service(
client_ctx, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, nullptr, 0, client_buffer));
ROCPROFILER_CHECK(rocprofiler_configure_buffer_tracing_service(
client_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, client_buffer));
ROCPROFILER_CHECK(rocprofiler_configure_buffer_tracing_service(
client_ctx,
ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT,
nullptr,
0,
client_buffer));
int valid_ctx = 0;
ROCPROFILER_CHECK(rocprofiler_context_is_valid(client_ctx, &valid_ctx));
if(valid_ctx == 0)
{
// notify rocprofiler that initialization failed
// and all the contexts, buffers, etc. created
// should be ignored
return -1;
}
ROCPROFILER_CHECK(rocprofiler_start_context(client_ctx));
// no errors
return 0;
}
void
tool_fini(void* tool_data)
{
assert(tool_data != nullptr);
client_fini_func = nullptr;
client_id = nullptr;
std::cout << "finalizing...\n" << std::flush;
rocprofiler_stop_context(client_ctx);
ROCPROFILER_CHECK(rocprofiler_flush_buffer(client_buffer));
auto* _call_stack = static_cast<call_stack_t*>(tool_data);
_call_stack->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""});
print_call_stack(*_call_stack);
size_t unretired = 0;
size_t unseen = 0;
for(auto* itr : client_external_corr_ids)
{
if(itr->seen_count != 1)
{
std::cerr << "external correlation ID seen " << itr->seen_count << " times: {" << *itr
<< "}\n"
<< std::flush;
++unseen;
}
if(client_retired_corr_ids.count(itr->internal_corr_id) != 1)
{
std::cerr << "internal correlation ID passed to external correlation ID request was "
"not retired: {"
<< itr->internal_corr_id << "}\n"
<< std::flush;
++unretired;
}
delete itr;
}
std::cerr << "external correlation IDs not seen : " << unseen << "\n" << std::flush;
std::cerr << "internal correlation IDs not retired: " << unretired << "\n" << std::flush;
if(unseen > 0) throw std::runtime_error{"unseen external correlation id data"};
if(unretired > 0) throw std::runtime_error{"unretired internal correlation id values"};
delete _call_stack;
}
} // namespace
void
setup()
{
if(int status = 0;
rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0)
{
ROCPROFILER_CHECK(rocprofiler_force_configure(&rocprofiler_configure));
}
}
void
shutdown()
{
if(client_id)
{
ROCPROFILER_CHECK(rocprofiler_flush_buffer(client_buffer));
client_fini_func(*client_id);
}
}
void
start()
{
ROCPROFILER_CHECK(rocprofiler_start_context(client_ctx));
}
void
identify(uint64_t val)
{
auto _tid = rocprofiler_thread_id_t{};
rocprofiler_get_thread_id(&_tid);
rocprofiler_user_data_t user_data = {};
user_data.value = val;
rocprofiler_push_external_correlation_id(client_ctx, _tid, user_data);
}
void
stop()
{
ROCPROFILER_CHECK(rocprofiler_stop_context(client_ctx));
}
} // namespace client
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// set the client name
id->name = "ExampleTool";
// store client info
client::client_id = id;
// compute major/minor/patch version info
uint32_t major = version / 10000;
uint32_t minor = (version % 10000) / 100;
uint32_t patch = version % 100;
// generate info string
auto info = std::stringstream{};
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
auto* client_tool_data = new std::vector<client::source_location>{};
client_tool_data->emplace_back(
client::source_location{__FUNCTION__, __FILE__, __LINE__, info.str()});
std::atexit([]() {
std::cout << "atexit handler...\n" << std::flush;
if(client::client_fini_func && client::client_id)
client::client_fini_func(*client::client_id);
});
// create configure data
static auto cfg =
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
&client::tool_init,
&client::tool_fini,
static_cast<void*>(client_tool_data)};
// return pointer to configure data
return &cfg;
}
@@ -0,0 +1,49 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#ifdef buffered_api_tracing_client_EXPORTS
# define CLIENT_API __attribute__((visibility("default")))
#else
# define CLIENT_API
#endif
#include <cstdint>
namespace client
{
void
setup() CLIENT_API;
void
shutdown() CLIENT_API;
void
start() CLIENT_API;
void
stop() CLIENT_API;
void
identify(uint64_t corr_id) CLIENT_API;
} // namespace client
@@ -0,0 +1,412 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "hip/hip_runtime.h"
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <random>
#include <sstream>
#include <stdexcept>
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
throw std::runtime_error("hip_api_call"); \
} \
}
namespace
{
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
size_t nthread_per_device = 2;
size_t nitr = 500;
size_t nsync = 10;
constexpr unsigned shared_mem_tile_dim = 32;
void
check_hip_error(void);
void
verify(int* in, int* out, int M, int N);
} // namespace
__global__ void
transpose(const int* in, int* out, int M, int N);
void
run(int rank, int tid, int devid, int argc, char** argv);
void
run_transpose(int rank, int tid, hipStream_t stream, int argc, char** argv);
void
run_migrate(int rank, int tid, hipStream_t stream, int, char** argv);
void
run_scratch(int rank, int tid, hipStream_t stream, int argc, char** argv);
int
main(int argc, char** argv)
{
auto* exe_name = ::basename(argv[0]);
int rank = 0;
for(int i = 1; i < argc; ++i)
{
auto _arg = std::string{argv[i]};
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
fprintf(stderr,
"usage: %s [NUM_THREADS_PER_DEVICE (%zu)] [NUM_ITERATION (%zu)] "
"[SYNC_EVERY_N_ITERATIONS (%zu)]\n",
exe_name,
nthread_per_device,
nitr,
nsync);
exit(EXIT_SUCCESS);
}
}
if(argc > 1) nthread_per_device = atoll(argv[1]);
if(argc > 2) nitr = atoll(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
int ndevice = 0;
HIP_API_CALL(hipGetDeviceCount(&ndevice));
auto nthreads = (ndevice * nthread_per_device);
printf("[%s] Number of devices found: %i\n", exe_name, ndevice);
printf("[%s] Number of threads (per device): %zu\n", exe_name, nthread_per_device);
printf("[%s] Number of threads (total): %zu\n", exe_name, nthreads);
printf("[%s] Number of iterations: %zu\n", exe_name, nitr);
printf("[%s] Syncing every %zu iterations\n", exe_name, nsync);
{
auto _threads = std::vector<std::thread>{};
for(size_t i = 0; i < nthreads; ++i)
_threads.emplace_back(run, rank, i, i % ndevice, argc, argv);
for(auto& itr : _threads)
itr.join();
}
HIP_API_CALL(hipDeviceSynchronize());
HIP_API_CALL(hipDeviceReset());
return 0;
}
__global__ void
transpose(const int* in, int* out, int M, int N)
{
__shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim];
int idx = (blockIdx.y * blockDim.y + threadIdx.y) * M + blockIdx.x * blockDim.x + threadIdx.x;
tile[threadIdx.y][threadIdx.x] = in[idx];
__syncthreads();
idx = (blockIdx.x * blockDim.x + threadIdx.y) * N + blockIdx.y * blockDim.y + threadIdx.x;
out[idx] = tile[threadIdx.x][threadIdx.y];
}
template <typename Tp>
__global__ void
test_page_migrate(Tp* data, Tp val)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
data[idx] += val;
}
__global__ void
test_kern_large(uint64_t* output)
{
uint64_t result = 0;
int test[4000];
memset(test, 5, 4000);
for(int& i : test)
{
i = i + 7;
*output += i;
result += i;
}
*output ^= result;
*output ^= result;
}
__global__ void
test_kern_medium(uint64_t* output)
{
uint64_t result = 0;
int test[175];
memset(test, 5, 175);
for(int& i : test)
{
i = i + 7;
*output += i;
result += i;
}
*output ^= result;
*output ^= result;
}
__global__ void
test_kern_small(uint64_t* output)
{
uint64_t result = 0;
int test[2];
for(int& i : test)
{
i = i + 7;
*output += i;
result += i;
}
*output ^= result;
*output ^= result;
}
void
run(int rank, int tid, int devid, int argc, char** argv)
{
auto* stream = hipStream_t{};
HIP_API_CALL(hipSetDevice(devid));
HIP_API_CALL(hipStreamCreate(&stream));
run_migrate(rank, tid, stream, argc, argv);
run_scratch(rank, tid, stream, argc, argv);
run_transpose(rank, tid, stream, argc, argv);
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipStreamDestroy(stream));
}
void
run_transpose(int rank, int tid, hipStream_t stream, int argc, char** argv)
{
auto* exe_name = ::basename(argv[0]);
unsigned int M = 4960 * 2;
unsigned int N = 4960 * 2;
if(argc > 2) nitr = atoll(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{print_lock};
std::cout << "[" << exe_name << "][transpose][" << rank << "][" << tid << "] M: " << M
<< " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
std::uniform_int_distribution<int> _dist{0, 1000};
size_t size = sizeof(int) * M * N;
int* inp_matrix = new int[size];
int* out_matrix = new int[size];
for(size_t i = 0; i < M * N; i++)
{
inp_matrix[i] = _dist(_engine);
out_matrix[i] = 0;
}
int* in = nullptr;
int* out = nullptr;
HIP_API_CALL(hipMalloc(&in, size));
HIP_API_CALL(hipMalloc(&out, size));
HIP_API_CALL(hipMemsetAsync(in, 0, size, stream));
HIP_API_CALL(hipMemsetAsync(out, 0, size, stream));
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream));
HIP_API_CALL(hipStreamSynchronize(stream));
dim3 grid(M / 32, N / 32, 1);
dim3 block(32, 32, 1); // transpose
print_lock.lock();
printf("[%s][transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
exe_name,
rank,
tid,
grid.x,
grid.y,
grid.z,
block.x,
block.y,
block.z);
print_lock.unlock();
auto t1 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < nitr; ++i)
{
transpose<<<grid, block, 0, stream>>>(in, out, M, N);
check_hip_error();
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
}
auto t2 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream));
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[" << exe_name << "][transpose][" << rank << "][" << tid
<< "] Runtime of transpose is " << time << " sec\n";
std::cout << "[" << exe_name << "][transpose][" << rank << "][" << tid
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
HIP_API_CALL(hipStreamSynchronize(stream));
// cpu_transpose(matrix, out_matrix, M, N);
verify(inp_matrix, out_matrix, M, N);
HIP_API_CALL(hipFree(in));
HIP_API_CALL(hipFree(out));
delete[] inp_matrix;
delete[] out_matrix;
}
void
run_scratch(int rank, int tid, hipStream_t stream, int, char** argv)
{
auto t1 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
const auto* exe_name = ::basename(argv[0]);
uint64_t* data_ptr = nullptr;
HIP_API_CALL(hipHostMalloc(&data_ptr, sizeof(uint64_t), 0));
*data_ptr = 0;
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
test_kern_medium<<<1000, 1, 0, stream>>>(data_ptr);
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
test_kern_large<<<1100, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_medium<<<1000, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_small<<<1000, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
test_kern_large<<<1100, 1, 0, stream>>>(data_ptr);
HIP_API_CALL(hipStreamSynchronize(stream));
auto t2 = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
print_lock.lock();
std::cout << "[" << exe_name << "][scratch][" << rank << "][" << tid
<< "] Runtime of scratch is " << time << " sec\n";
print_lock.unlock();
}
void
run_migrate(int rank, int tid, hipStream_t stream, int, char** argv)
{
using data_type = uint64_t;
constexpr data_type init_v = 1;
constexpr data_type incr_v = 1;
auto t1 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
const auto* exe_name = ::basename(argv[0]);
auto page_data = std::vector<data_type>(1024, 0);
HIP_API_CALL(hipHostRegister(
page_data.data(), page_data.size() * sizeof(data_type), hipHostRegisterDefault));
for(auto& itr : page_data)
itr = init_v;
test_page_migrate<<<1, 1024, 0, stream>>>(page_data.data(), incr_v);
HIP_API_CALL(hipStreamSynchronize(stream));
for(auto& itr : page_data)
{
auto diff = (itr - incr_v);
if(diff != init_v)
{
auto msg = std::stringstream{};
msg << "invalid diff: " << diff << ". expected: " << init_v;
throw std::runtime_error{msg.str()};
}
}
HIP_API_CALL(hipHostUnregister(page_data.data()));
auto t2 = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
print_lock.lock();
std::cout << "[" << exe_name << "][migrate][" << rank << "][" << tid
<< "] Runtime of migrate is " << time << " sec\n";
print_lock.unlock();
}
namespace
{
void
check_hip_error(void)
{
hipError_t err = hipGetLastError();
if(err != hipSuccess)
{
auto_lock_t _lk{print_lock};
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
throw std::runtime_error("hip_api_call");
}
}
void
verify(int* in, int* out, int M, int N)
{
for(int i = 0; i < 10; i++)
{
int row = rand() % M;
int col = rand() % N;
if(in[row * N + col] != out[col * M + row])
{
auto_lock_t _lk{print_lock};
std::cout << "mismatch: " << row << ", " << col << " : " << in[row * N + col] << " | "
<< out[col * M + row] << "\n";
}
}
}
} // namespace
@@ -266,9 +266,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "ExampleTool";
@@ -282,8 +279,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
@@ -229,9 +229,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* client_id)
{
// If not first tool to register, indicate that the tool doesn't want to do anything
if(priority > 0) return nullptr;
// (optional) Provide a name for this tool to rocprofiler
client_id->name = "ExampleTool";
@@ -41,3 +41,4 @@ install(
add_subdirectory(hip)
add_subdirectory(hsa)
add_subdirectory(marker)
add_subdirectory(cxx)
@@ -0,0 +1,13 @@
#
#
# Installation of public C++ headers
#
#
set(ROCPROFILER_CXX_HEADER_FILES name_info.hpp serialization.hpp)
install(
FILES ${ROCPROFILER_CXX_HEADER_FILES}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/cxx
COMPONENT development)
add_subdirectory(details)
@@ -0,0 +1,11 @@
#
#
# Installation of public C++ headers (implementations)
#
#
set(ROCPROFILER_CXX_DETAILS_HEADER_FILES mpl.hpp name_info.hpp)
install(
FILES ${ROCPROFILER_CXX_DETAILS_HEADER_FILES}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk/cxx/details
COMPONENT development)
@@ -0,0 +1,84 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#pragma once
#include <string>
#include <string_view>
#include <type_traits>
#include <utility>
namespace rocprofiler
{
namespace sdk
{
namespace mpl
{
template <typename Tp>
struct string_support
{
using type = Tp;
using return_type = void;
static constexpr auto value = false;
static constexpr void default_value() {}
};
template <>
struct string_support<const char*>
{
using type = const char*;
using return_type = type;
static constexpr auto value = true;
static constexpr type default_value() { return nullptr; }
type operator()(const char* val) const { return val; }
};
template <>
struct string_support<std::string_view>
{
using type = std::string_view;
using return_type = type;
static constexpr auto value = true;
static constexpr type default_value() { return type{}; }
type operator()(const char* val) const { return type{val}; }
};
template <>
struct string_support<std::string>
{
using type = std::string;
using return_type = type;
static constexpr auto value = true;
static type default_value() { return type{}; }
type operator()(const char* val) const { return type{val}; }
};
} // namespace mpl
} // namespace sdk
} // namespace rocprofiler
@@ -0,0 +1,180 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#pragma once
#if !defined(ROCPROFILER_SDK_CXX_NAME_INFO_HPP_)
# include <rocprofiler-sdk/cxx/name_info.hpp>
#endif
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/fwd.h>
#include <string_view>
#include <type_traits>
#include <vector>
namespace rocprofiler
{
namespace sdk
{
namespace utility
{
template <typename EnumT, typename ValueT>
typename name_info_impl<EnumT, ValueT>::item_array_t
name_info_impl<EnumT, ValueT>::items() const
{
auto ret = item_array_t{};
ret.reserve(operations.size());
rocprofiler_tracing_operation_t _idx = 0;
for(const auto& itr : operations)
ret.emplace_back(_idx++, &itr);
return ret;
}
template <typename EnumT, typename ValueT>
inline void
name_info<EnumT, ValueT>::emplace(EnumT idx, const char* name)
{
impl.resize(idx + 1, value_type{});
impl.at(idx).value = idx;
impl.at(idx).name = support_type{}(name);
}
template <typename EnumT, typename ValueT>
inline void
name_info<EnumT, ValueT>::emplace(EnumT idx,
rocprofiler_tracing_operation_t opidx,
const char* name)
{
impl.resize(idx + 1, value_type{});
impl.at(idx).operations.resize(opidx + 1, support_type::default_value());
impl.at(idx).operations.at(opidx) = support_type{}(name);
}
template <typename EnumT, typename ValueT>
typename name_info<EnumT, ValueT>::return_type
name_info<EnumT, ValueT>::at(EnumT idx) const
{
return impl.at(idx).name;
}
template <typename EnumT, typename ValueT>
typename name_info<EnumT, ValueT>::return_type
name_info<EnumT, ValueT>::at(EnumT idx, rocprofiler_tracing_operation_t opidx) const
{
return impl.at(idx).operations.at(opidx);
}
template <typename EnumT, typename ValueT>
typename name_info<EnumT, ValueT>::item_array_t
name_info<EnumT, ValueT>::items() const
{
auto ret = item_array_t{};
ret.reserve(impl.size());
for(const auto& itr : impl)
ret.emplace_back(&itr);
return ret;
}
} // namespace utility
constexpr auto success_v = ROCPROFILER_STATUS_SUCCESS;
template <typename Tp>
inline callback_name_info_t<Tp>
get_callback_tracing_names()
{
auto cb_name_info = callback_name_info_t<Tp>{};
//
// callback for each kind operation
//
static auto tracing_kind_operation_cb =
[](rocprofiler_callback_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<callback_name_info_t<Tp>*>(data_v);
const char* name = nullptr;
auto status = rocprofiler_query_callback_tracing_kind_operation_name(
kindv, operation, &name, nullptr);
if(status == success_v && name) name_info_v->emplace(kindv, operation, name);
return 0;
};
//
// callback for each buffer kind (i.e. domain)
//
static auto tracing_kind_cb = [](rocprofiler_callback_tracing_kind_t kind, void* data) {
// store the buffer kind name
auto* name_info_v = static_cast<callback_name_info_t<Tp>*>(data);
const char* name = nullptr;
auto status = rocprofiler_query_callback_tracing_kind_name(kind, &name, nullptr);
if(status == success_v && name) name_info_v->emplace(kind, name);
rocprofiler_iterate_callback_tracing_kind_operations(kind, tracing_kind_operation_cb, data);
return 0;
};
rocprofiler_iterate_callback_tracing_kinds(tracing_kind_cb, &cb_name_info);
return cb_name_info;
}
template <typename Tp>
inline buffer_name_info_t<Tp>
get_buffer_tracing_names()
{
auto cb_name_info = buffer_name_info_t<Tp>{};
//
// callback for each kind operation
//
static auto tracing_kind_operation_cb =
[](rocprofiler_buffer_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<buffer_name_info_t<Tp>*>(data_v);
const char* name = nullptr;
auto status = rocprofiler_query_buffer_tracing_kind_operation_name(
kindv, operation, &name, nullptr);
if(status == success_v && name) name_info_v->emplace(kindv, operation, name);
return 0;
};
//
// callback for each buffer kind (i.e. domain)
//
static auto tracing_kind_cb = [](rocprofiler_buffer_tracing_kind_t kind, void* data) {
// store the buffer kind name
auto* name_info_v = static_cast<buffer_name_info_t<Tp>*>(data);
const char* name = nullptr;
auto status = rocprofiler_query_buffer_tracing_kind_name(kind, &name, nullptr);
if(status == success_v && name) name_info_v->emplace(kind, name);
rocprofiler_iterate_buffer_tracing_kind_operations(kind, tracing_kind_operation_cb, data);
return 0;
};
rocprofiler_iterate_buffer_tracing_kinds(tracing_kind_cb, &cb_name_info);
return cb_name_info;
}
} // namespace sdk
} // namespace rocprofiler
@@ -0,0 +1,116 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#pragma once
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/cxx/details/mpl.hpp>
#include <string>
#include <string_view>
#include <type_traits>
#include <vector>
namespace rocprofiler
{
namespace sdk
{
namespace utility
{
template <typename EnumT, typename ValueT = std::string_view>
struct name_info_impl
{
using support_type = mpl::string_support<ValueT>;
using enum_type = EnumT;
using value_type = ValueT;
using return_type = typename support_type::return_type;
using item_type = std::pair<rocprofiler_tracing_operation_t, const value_type*>;
using item_array_t = std::vector<item_type>;
static_assert(support_type::value,
"value_type must be supported by rocprofiler::sdk::mpl::string_support");
return_type operator()() const { return name; }
return_type operator()(size_t idx) const { return operations.at(idx); }
return_type operator[](size_t idx) const { return operations.at(idx); }
item_array_t items() const;
EnumT value = static_cast<EnumT>(0);
value_type name = {};
std::vector<value_type> operations = {};
};
template <typename EnumT, typename ValueT = std::string_view>
struct name_info
{
using value_type = name_info_impl<EnumT, ValueT>;
using enum_type = EnumT;
using support_type = typename value_type::support_type;
using return_type = typename value_type::return_type;
using item_type = const value_type*;
using item_array_t = std::vector<item_type>;
void emplace(EnumT idx, const char* name);
void emplace(EnumT idx, rocprofiler_tracing_operation_t opidx, const char* name);
return_type at(EnumT idx) const;
return_type at(EnumT idx, rocprofiler_tracing_operation_t opidx) const;
item_array_t items() const;
decltype(auto) size() const { return impl.size(); }
decltype(auto) begin() { return impl.begin(); }
decltype(auto) begin() const { return impl.begin(); }
decltype(auto) end() { return impl.end(); }
decltype(auto) end() const { return impl.end(); }
value_type& operator[](size_t idx) { return impl.at(idx); }
const value_type& operator[](size_t idx) const { return impl.at(idx); }
private:
std::vector<value_type> impl = {};
};
} // namespace utility
template <typename Tp = std::string_view>
using callback_name_info_t = utility::name_info<rocprofiler_callback_tracing_kind_t, Tp>;
template <typename Tp = std::string_view>
using buffer_name_info_t = utility::name_info<rocprofiler_buffer_tracing_kind_t, Tp>;
using callback_name_info = callback_name_info_t<std::string_view>;
using buffer_name_info = buffer_name_info_t<std::string_view>;
template <typename Tp = std::string_view>
callback_name_info_t<Tp>
get_callback_tracing_names();
template <typename Tp = std::string_view>
buffer_name_info_t<Tp>
get_buffer_tracing_names();
} // namespace sdk
} // namespace rocprofiler
#define ROCPROFILER_SDK_CXX_NAME_INFO_HPP_ 1
#include <rocprofiler-sdk/cxx/details/name_info.hpp>
@@ -0,0 +1,58 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#pragma once
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include <cereal/cereal.hpp>
#include <string>
#include <string_view>
#include <vector>
namespace cereal
{
template <typename ArchiveT, typename EnumT, typename ValueT>
void
save(ArchiveT& ar, const rocprofiler::sdk::utility::name_info<EnumT, ValueT>& data)
{
ar.makeArray();
for(const auto& itr : data)
ar(cereal::make_nvp("entry", itr));
}
template <typename ArchiveT, typename EnumT, typename ValueT>
void
save(ArchiveT& ar, const rocprofiler::sdk::utility::name_info_impl<EnumT, ValueT>& data)
{
auto _name = std::string{data.name};
auto _ops = std::vector<std::string>{};
_ops.reserve(data.operations.size());
ar(cereal::make_nvp("kind", _name));
for(auto itr : data.operations)
_ops.emplace_back(itr);
ar(cereal::make_nvp("operations", _ops));
}
} // namespace cereal
@@ -204,7 +204,8 @@ rocprofiler_is_finalized(int* status) ROCPROFILER_API ROCPROFILER_NONNULL(1);
* uint32_t patch = version % 100;
*
* // print info
* printf("Configuring rocprofiler (v%u.%u.%u) [%s]\n", major, minor, patch, runtime_version);
* printf("Configuring %s with rocprofiler-sdk (v%u.%u.%u) [%s]\n",
* client_id->name, major, minor, patch, runtime_version);
*
* // create configure data
* static auto cfg = rocprofiler_tool_configure_result_t{ &my_init_func,
@@ -21,6 +21,7 @@
THE SOFTWARE. */
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
#include "lib/common/scope_destructor.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/agent.hpp"
#include "lib/rocprofiler-sdk/buffer.hpp"
@@ -156,8 +157,8 @@ AsyncSignalHandler(hsa_signal_value_t /*signal_v*/, void* data)
LOG_IF(FATAL, _corr_id->get_ref_count() == 0)
<< "reference counter for correlation id " << _corr_id->internal << " from thread "
<< _corr_id->thread_idx << " has no reference count";
_corr_id->sub_ref_count();
_corr_id->sub_kern_count();
_corr_id->sub_ref_count();
}
queue_info_session.queue.async_complete();
@@ -239,18 +240,6 @@ WriteInterceptor(const void* packets,
ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH,
tracing_data_v);
auto* corr_id = context::get_latest_correlation_id();
auto thr_id = (corr_id) ? corr_id->thread_idx : common::get_tid();
auto user_data = rocprofiler_user_data_t{.value = 0};
auto internal_corr_id = (corr_id) ? corr_id->internal : 0;
tracing::populate_external_correlation_ids(
tracing_data_v.external_correlation_ids,
thr_id,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH,
ROCPROFILER_KERNEL_DISPATCH_ENQUEUE,
internal_corr_id);
const auto* packets_arr = static_cast<const rocprofiler_packet*>(packets);
auto transformed_packets = std::vector<rocprofiler_packet>{};
@@ -267,6 +256,41 @@ WriteInterceptor(const void* packets,
continue;
}
auto* corr_id = context::get_latest_correlation_id();
context::correlation_id* _corr_id_pop = nullptr;
if(!corr_id)
{
constexpr auto ref_count = 1;
corr_id = context::correlation_tracing_service::construct(ref_count);
_corr_id_pop = corr_id;
}
// increase the reference count to denote that this correlation id is being used in a kernel
corr_id->add_ref_count();
corr_id->add_kern_count();
auto thr_id = (corr_id) ? corr_id->thread_idx : common::get_tid();
auto user_data = rocprofiler_user_data_t{.value = 0};
auto internal_corr_id = (corr_id) ? corr_id->internal : 0;
// if we constructed a correlation id, this decrements the reference count after the
// underlying function returns
auto _corr_id_dtor = common::scope_destructor{[_corr_id_pop]() {
if(_corr_id_pop)
{
context::pop_latest_correlation_id(_corr_id_pop);
_corr_id_pop->sub_ref_count();
}
}};
tracing::populate_external_correlation_ids(
tracing_data_v.external_correlation_ids,
thr_id,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH,
ROCPROFILER_KERNEL_DISPATCH_ENQUEUE,
internal_corr_id);
queue.async_started();
// Copy kernel pkt, copy is to allow for signal to be modified
rocprofiler_packet kernel_pkt = packets_arr[i];
@@ -274,13 +298,6 @@ WriteInterceptor(const void* packets,
queue.create_signal(HSA_AMD_SIGNAL_AMD_GPU_ONLY,
&kernel_pkt.ext_amd_aql_pm4.completion_signal);
// increase the reference count to denote that this correlation id is being used in a kernel
if(corr_id)
{
corr_id->add_ref_count();
corr_id->add_kern_count();
}
// computes the "size" based on the offset of reserved_padding field
constexpr auto kernel_dispatch_info_rt_size =
common::compute_runtime_sizeof<rocprofiler_kernel_dispatch_info_t>();
@@ -69,7 +69,7 @@
struct page_migration_info<ROCPROFILER_PAGE_MIGRATION_##TYPE> \
{ \
static constexpr auto operation_idx = ROCPROFILER_PAGE_MIGRATION_##TYPE; \
static constexpr auto name = #TYPE; \
static constexpr auto name = "PAGE_MIGRATION_" #TYPE; \
static constexpr size_t uvm_bitmask = \
bitmask(std::index_sequence<GET_UVM_ENUMS(__VA_ARGS__)>()); \
static constexpr size_t kfd_bitmask = \
@@ -26,13 +26,14 @@ set(c-tool-env
set_tests_properties(
test-c-tool-execute
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
ENVIRONMENT
"${c-tool-env}"
PASS_REGULAR_EXPRESSION
"Test C tool is using rocprofiler-sdk v([0-9]+\\.[0-9]+\\.[0-9]+)"
FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
PROPERTIES
TIMEOUT
45
LABELS
"integration-tests"
ENVIRONMENT
"${c-tool-env}"
PASS_REGULAR_EXPRESSION
"Test C tool \\(priority=0\\) is using rocprofiler-sdk v([0-9]+\\.[0-9]+\\.[0-9]+)"
FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -0,0 +1,27 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#pragma once
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include <rocprofiler-sdk/cxx/serialization.hpp>
@@ -21,17 +21,17 @@ def to_dict(key_values):
return a
def op_name(op_name, record):
found_op = False
op_key = None
def get_operation(record, kind_name, op_name=None):
for idx, itr in enumerate(record["names"]):
if kind_name == itr["kind"]:
if op_name is None:
return idx, itr["operations"]
else:
for oidx, oname in enumerate(itr["operations"]):
if op_name == oname:
return oidx
for kind_node in record["names"]["kind_names"]:
if kind_node["value"] == op_name:
op_key = kind_node["key"]
for op_node in record["names"]["operation_names"]:
if op_node["key"] == op_key:
return op_key, to_dict(op_node["value"])
return None
def dict_from_value_key(d):
@@ -259,18 +259,17 @@ def test_retired_correlation_ids(input_data):
def get_allocated_pages(callback_records):
# Get how many pages we allocated
hip_api_traces = callback_records["hip_api_traces"]
_, op_dict = op_name("HIP_RUNTIME_API", callback_records)
op_key = [k for k, v in op_dict.items() if v == "hipHostRegister"][0]
op_idx = get_operation(callback_records, "HIP_RUNTIME_API", "hipHostRegister")
rt_idx, rt_data = get_operation(callback_records, "HIP_RUNTIME_API")
assert op_idx is not None, f"{rt_idx}:\n{rt_data}"
host_register_record = []
for r in hip_api_traces:
if (
r["operation"] == op_key
and "sizeBytes" in r["args"]
and "hostPtr" in r["args"]
):
host_register_record.append(r)
for itr in callback_records["hip_api_traces"]:
if itr["kind"] == rt_idx and itr["operation"] == op_idx and itr["phase"] == 2:
assert "sizeBytes" in itr["args"].keys(), f"{itr}"
assert "hostPtr" in itr["args"].keys(), f"{itr}"
host_register_record.append(itr)
assert len(host_register_record) == 1
alloc_size = int(host_register_record[0]["args"]["sizeBytes"], 10)
@@ -285,11 +284,11 @@ def test_page_migration_data(input_data):
sdk_data = data["rocprofiler-sdk-json-tool"]
buffer_records = sdk_data["buffer_records"]
callback_records = sdk_data["callback_records"]
page_migtation_buffers = buffer_records["page_migration"]
page_migration_buffers = buffer_records["page_migration"]
bf_op_id, bf_op_names = op_name("PAGE_MIGRATION", buffer_records)
assert bf_op_names[0] == "NONE"
assert "PAGE_MIGRATE" in str(bf_op_names)
_, bf_op_names = get_operation(buffer_records, "PAGE_MIGRATION")
assert bf_op_names[0] == "PAGE_MIGRATION_NONE"
assert "PAGE_MIGRATION_PAGE_MIGRATE" in bf_op_names
assert len(bf_op_names) == 5
node_ids = set(x["gpu_id"] for x in sdk_data["agents"])
@@ -299,15 +298,14 @@ def test_page_migration_data(input_data):
assert int(alloc_size) == 16 * 4096 # We allocated 16 pages in the test
# PID must be same
assert len(set(r["pid"] for r in page_migtation_buffers)) == 1
assert len(set(r["pid"] for r in page_migration_buffers)) == 1
for r in page_migtation_buffers:
for r in page_migration_buffers:
op = r["operation"]
assert r["size"] == 136
assert r["kind"] == bf_op_id
assert op != 0 and bf_op_names[op] != "NONE"
assert bf_op_names[op].lower() in r
assert op != 0 and bf_op_names[op] != "PAGE_MIGRATION_NONE"
assert bf_op_names[op].lower().replace("page_migration_", "") in r.keys()
if "page_migrate" in r:
assert r["page_migrate"]["from_node"] in node_ids
@@ -328,7 +326,7 @@ def test_page_migration_data(input_data):
assert 0 < r["start_timestamp"] < r["end_timestamp"]
# Check for events with our page
for r in page_migtation_buffers:
for r in page_migration_buffers:
if "page_migrate" in r and r["page_migrate"]["start_addr"] == start_addr:
assert end_addr == r["page_migrate"]["end_addr"]
@@ -35,6 +35,8 @@ class dotdict(dict):
for k, v in self.items():
if isinstance(v, dict):
self.__setitem__(k, dotdict(v))
# print(k)
elif isinstance(v, list):
self.__setitem__(k, [dotdict(i) for i in v])
elif isinstance(v, (list, tuple)):
self.__setitem__(
k,
[dotdict(i) if isinstance(i, (list, tuple, dict)) else i for i in v],
)
@@ -152,16 +152,15 @@ def test_external_correlation_ids(input_data):
assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}"
def op_name(op_name, record):
op_key = None
for kind_node in record["names"]["kind_names"]:
if kind_node["value"] == op_name:
op_key = kind_node["key"]
for op_node in record["names"]["operation_names"]:
if op_node["key"] == op_key:
return op_node
def get_operation(record, kind_name, op_name=None):
for idx, itr in enumerate(record["names"]):
if kind_name == itr["kind"]:
if op_name is None:
return idx, itr["operations"]
else:
for oidx, oname in enumerate(itr["operations"]):
if op_name == oname:
return oidx
return None
@@ -179,16 +178,14 @@ def test_scratch_memory_tracking(input_data):
assert len(scratch_callback_data) == 2 * len(scratch_buffer_data)
cb_op_names = op_name("SCRATCH_MEMORY", callback_records)["value"]
bf_op_names = op_name("SCRATCH_MEMORY", buffer_records)["value"]
_, cb_op_names = get_operation(callback_records, "SCRATCH_MEMORY")
_, bf_op_names = get_operation(buffer_records, "SCRATCH_MEMORY")
assert len(cb_op_names) == 4
assert len(bf_op_names) == 4
# op name -> enum value
scratch_cb_op_map = {node["value"]: node["key"] for node in cb_op_names}
scratch_bf_op_map = {node["value"]: node["key"] for node in bf_op_names}
assert scratch_cb_op_map == scratch_bf_op_map
assert cb_op_names == bf_op_names
scratch_reported_agent_ids = set()
detected_agents_ids = set(
@@ -253,10 +250,8 @@ def test_scratch_memory_tracking(input_data):
), f"this:\n{this_node}\n\nnext:\n{next_node}"
# alloc has more data vs free and async reclaim
scratch_alloc_node = (
this_node["operation"] == scratch_cb_op_map["SCRATCH_MEMORY_ALLOC"]
)
if scratch_alloc_node:
scratch_alloc_node = cb_op_names[this_node["operation"]]
if scratch_alloc_node == "SCRATCH_MEMORY_ALLOC":
assert (
pl(this_node)["queue_id"]["handle"]
== pl(next_node)["queue_id"]["handle"]
+2 -4
Ver Arquivo
@@ -40,9 +40,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return NULL;
// set the client name
id->name = "Test C tool";
@@ -52,8 +49,9 @@ rocprofiler_configure(uint32_t version,
uint32_t patch = version % 100;
// generate info string
printf("%s is using rocprofiler-sdk v%i.%i.%i (%s)\n",
printf("%s (priority=%u) is using rocprofiler-sdk v%i.%i.%i (%s)\n",
id->name,
priority,
major,
minor,
patch,
+23 -155
Ver Arquivo
@@ -34,6 +34,7 @@
#include "common/defines.hpp"
#include "common/filesystem.hpp"
#include "common/hash.hpp"
#include "common/name_info.hpp"
#include "common/perfetto.hpp"
#include "common/serialization.hpp"
@@ -197,146 +198,14 @@ make_array(Tp&& arg, Args&&... args)
return std::array<Tp, N>{std::forward<Tp>(arg), std::forward<Args>(args)...};
}
using call_stack_t = std::vector<source_location>;
using buffer_kind_names_t = std::map<rocprofiler_buffer_tracing_kind_t, std::string>;
using buffer_kind_operation_names_t =
std::map<rocprofiler_buffer_tracing_kind_t, std::map<uint32_t, std::string>>;
using callback_kind_names_t = std::map<rocprofiler_callback_tracing_kind_t, std::string>;
using callback_kind_operation_names_t =
std::map<rocprofiler_callback_tracing_kind_t, std::map<uint32_t, std::string>>;
using call_stack_t = std::vector<source_location>;
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
using kernel_symbol_map_t = std::unordered_map<rocprofiler_kernel_id_t, kernel_symbol_data_t>;
struct callback_name_info
{
callback_kind_names_t kind_names = {};
callback_kind_operation_names_t operation_names = {};
template <typename ArchiveT>
void save(ArchiveT& ar) const
{
ar(cereal::make_nvp("kind_names", kind_names));
ar(cereal::make_nvp("operation_names", operation_names));
}
};
struct buffer_name_info
{
buffer_kind_names_t kind_names = {};
buffer_kind_operation_names_t operation_names = {};
template <typename ArchiveT>
void save(ArchiveT& ar) const
{
ar(cereal::make_nvp("kind_names", kind_names));
ar(cereal::make_nvp("operation_names", operation_names));
}
};
rocprofiler_client_id_t* client_id = nullptr;
rocprofiler_client_finalize_t client_fini_func = nullptr;
callback_name_info
get_callback_tracing_names()
{
auto cb_name_info = callback_name_info{};
//
// callback for each kind operation
//
static auto tracing_kind_operation_cb =
[](rocprofiler_callback_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<callback_name_info*>(data_v);
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
return 0;
};
//
// callback for each buffer kind (i.e. domain)
//
static auto tracing_kind_cb = [](rocprofiler_callback_tracing_kind_t kind, void* data) {
// store the buffer kind name
auto* name_info_v = static_cast<callback_name_info*>(data);
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_callback_tracing_kind_name(kind, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->kind_names[kind] = name;
rocprofiler_iterate_callback_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data));
return 0;
};
ROCPROFILER_CALL(rocprofiler_iterate_callback_tracing_kinds(tracing_kind_cb,
static_cast<void*>(&cb_name_info)),
"iterating buffer tracing kinds");
return cb_name_info;
}
buffer_name_info
get_buffer_tracing_names()
{
static const auto supported = std::unordered_set<rocprofiler_buffer_tracing_kind_t>{
ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API,
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API,
ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API,
ROCPROFILER_BUFFER_TRACING_MARKER_CONTROL_API,
ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION,
};
auto cb_name_info = buffer_name_info{};
//
// callback for each kind operation
//
static auto tracing_kind_operation_cb =
[](rocprofiler_buffer_tracing_kind_t kindv, uint32_t operation, void* data_v) {
auto* name_info_v = static_cast<buffer_name_info*>(data_v);
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_operation_name(
kindv, operation, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->operation_names[kindv][operation] = name;
return 0;
};
//
// callback for each buffer kind (i.e. domain)
//
static auto tracing_kind_cb = [](rocprofiler_buffer_tracing_kind_t kind, void* data) {
// store the buffer kind name
auto* name_info_v = static_cast<buffer_name_info*>(data);
const char* name = nullptr;
ROCPROFILER_CALL(rocprofiler_query_buffer_tracing_kind_name(kind, &name, nullptr),
"query buffer tracing kind operation name");
if(name) name_info_v->kind_names[kind] = name;
rocprofiler_iterate_buffer_tracing_kind_operations(
kind, tracing_kind_operation_cb, static_cast<void*>(data));
return 0;
};
ROCPROFILER_CALL(rocprofiler_iterate_buffer_tracing_kinds(tracing_kind_cb,
static_cast<void*>(&cb_name_info)),
"iterating buffer tracing kinds");
return cb_name_info;
}
using callback_payload_t =
std::variant<rocprofiler_callback_tracing_code_object_load_data_t,
rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t,
@@ -1569,12 +1438,12 @@ write_json(call_stack_t* _call_stack)
{
using JSONOutputArchive = cereal::MinimalJSONOutputArchive;
constexpr auto json_prec = 32;
constexpr auto json_indent = JSONOutputArchive::Options::IndentChar::space;
auto json_opts = JSONOutputArchive::Options{json_prec, json_indent, 1};
auto json_ar = JSONOutputArchive{*ofs, json_opts};
auto buffer_name_info = get_buffer_tracing_names();
auto callback_name_info = get_callback_tracing_names();
constexpr auto json_prec = 32;
constexpr auto json_indent = JSONOutputArchive::Options::IndentChar::space;
auto json_opts = JSONOutputArchive::Options{json_prec, json_indent, 1};
auto json_ar = JSONOutputArchive{*ofs, json_opts};
auto buffer_names = rocprofiler::sdk::get_buffer_tracing_names();
auto callbk_names = rocprofiler::sdk::get_callback_tracing_names();
auto validate_page_migration =
(page_migration_status != ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL);
@@ -1598,7 +1467,7 @@ write_json(call_stack_t* _call_stack)
json_ar.startNode();
try
{
json_ar(cereal::make_nvp("names", callback_name_info));
json_ar(cereal::make_nvp("names", callbk_names));
json_ar(cereal::make_nvp("code_objects", code_object_records));
json_ar(cereal::make_nvp("kernel_symbols", kernel_symbol_records));
json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_cb_records));
@@ -1619,7 +1488,7 @@ write_json(call_stack_t* _call_stack)
json_ar.startNode();
try
{
json_ar(cereal::make_nvp("names", buffer_name_info));
json_ar(cereal::make_nvp("names", buffer_names));
json_ar(cereal::make_nvp("kernel_dispatches", kernel_dispatch_bf_records));
json_ar(cereal::make_nvp("memory_copies", memory_copy_bf_records));
json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records));
@@ -1786,12 +1655,12 @@ write_perfetto()
}
{
auto buffer_name_info = get_buffer_tracing_names();
auto callbk_name_info = get_callback_tracing_names();
auto buffer_names = rocprofiler::sdk::get_buffer_tracing_names();
auto callbk_name_info = rocprofiler::sdk::get_callback_tracing_names();
for(auto itr : hsa_api_bf_records)
{
auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation);
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = thread_tracks.at(itr.thread_id);
auto _args = callback_arg_array_t{};
@@ -1803,7 +1672,7 @@ write_perfetto()
if(ritr != hsa_api_cb_records.end()) _args = ritr->args;
TRACE_EVENT_BEGIN(rocprofiler::trait::name<rocprofiler::category::hsa_api>::value,
::perfetto::StaticString(name.c_str()),
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
@@ -1830,7 +1699,7 @@ write_perfetto()
for(auto itr : hip_api_bf_records)
{
auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation);
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = thread_tracks.at(itr.thread_id);
auto _args = callback_arg_array_t{};
@@ -1842,7 +1711,7 @@ write_perfetto()
if(ritr != hip_api_cb_records.end()) _args = ritr->args;
TRACE_EVENT_BEGIN(rocprofiler::trait::name<rocprofiler::category::hip_api>::value,
::perfetto::StaticString(name.c_str()),
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
@@ -1869,11 +1738,11 @@ write_perfetto()
for(auto itr : memory_copy_bf_records)
{
auto& name = buffer_name_info.operation_names.at(itr.kind).at(itr.operation);
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = agent_tracks.at(itr.dst_agent_id.handle);
TRACE_EVENT_BEGIN(rocprofiler::trait::name<rocprofiler::category::memory_copy>::value,
::perfetto::StaticString(name.c_str()),
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
@@ -1886,7 +1755,9 @@ write_perfetto()
"src_agent",
agents_map.at(itr.src_agent_id).logical_node_id,
"dst_agent",
agents_map.at(itr.dst_agent_id).logical_node_id);
agents_map.at(itr.dst_agent_id).logical_node_id,
"copy_bytes",
itr.bytes);
TRACE_EVENT_END(rocprofiler::trait::name<rocprofiler::category::memory_copy>::value,
track,
itr.end_timestamp,
@@ -2070,9 +1941,6 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "rocprofiler-sdk-json-tool";
@@ -2086,8 +1954,8 @@ rocprofiler_configure(uint32_t version,
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "."
<< minor << "." << patch << " (" << runtime_version << ")";
std::clog << info.str() << std::endl;