Files
rocm-systems/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp
T
Jonathan R. Madsen 18da0bd49d Contexts, tracing, include reorg, registration, thread-pool (#65)
* Update scripts/update-doxygen.sh

- ensure build-docs folder exists

* Update scripts/run-ci.py

- exclude files in details subdirectory from code coverage

* Update scripts/thread-sanitizer-suppr.txt

- exclude races in glog

* Update docs/rocprofiler.dox.in

- exclude defines in include/rocprofiler/defines.h from doxygen
- Tweak EXCLUDE_PATTERNS and EXAMPLE_PATTERNS

* Update docs workflow

- trigger workflow whenever there is a change to the public headers (which may be doxygen comments)

* Update include/rocprofiler (reorg and overhaul)

- rocprofiler_status_t additions
  - CONTEXT_NOT_FOUND
  - CONTEXT_ERROR
  - INVALID_CONTEXT_ID
  - INVALID_CONTEXT
  - BUFFER_BUSY
- rocprofiler_context_is_active func
- rocprofiler_context_is_valid func
- rocprofiler_service_callback_tracing_kind_t update
  - remove ROCPROFILER_SERVICE_CALLBACK_TRACING_HELPER_THREAD
- Remove rocprofiler_tracing_helper_thread_operation_t
- Remove rocprofiler_helper_thread_callback_tracer_data_t
- Added rocprofiler_internal_thread_library_t
- Added rocprofiler_at_internal_thread_create
- split rocprofiler.h into several smaller headers
- reworked rocprofiler_status_t values
- added doxygen comments for enums
- replaced rocprofiler_trace_record_operation_kind_t with rocprofiler_trace_operation_t
- use @ instead of / in doxygen comment in rocprofiler_plugin.h
- fix ref to ROCPROFILER_SERVICE_CALLBACK_TRACING_MARKER_API
- end group in fwd.h
- remove PROFILE_COUNTING group in dispatch_profile.h
- remove premature group close in callback_tracing.h
- hsa.h: remove rocprofiler_hsa_trace_data_t
- fwd.h: remove rocprofiler_tracer_callback_data_t
- rename rocprofiler_correlation_id_t.handle to rocprofiler_correlation_id_t.id (consistency)
- fwd.h: add rocprofiler_callback_tracing_record_t
- callback_tracing.h: update rocprofiler_hsa_api_callback_tracer_data_t
- callback_tracing.h: add size fields
- simplify rocprofiler_tracer_callback_t
- removed ROCPROFILER_NONNULL from rocprofiler_get_version
- added rocprofiler_get_timestamp
- ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED in rocprofiler_status_t
- add ROCPROFILER_STATUS_ERROR_THREAD_NOT_FOUND rocprofiler_status_t
- add rocprofiler_buffer_category_t
- rocprofiler_trace_operation_t -> rocprofiler_tracing_operation_t
- rocprofiler_user_data_t union
- tweak rocprofiler_callback_tracing_record_t
  - make external_correlation_id non-pointer
  - add rocprofiler_user_data_t data field
- tweak rocprofiler_record_header_t
  - instead of single uint64_t kind field, have union for category + kind (two u32) with u64 hash
- API extensions for kind id <-> kind string
- API extensions for operation id <-> operation string
- rocprofiler_callback_trace_kind_name_cb_t
- rocprofiler_callback_trace_operation_name_cb_t
- rocprofiler_iterate_callback_trace_kind_names
- rocprofiler_iterate_callback_trace_kind_operation_names
- modify rocprofiler_hsa_api_callback_tracer_data_t data members (remove pointers)
- add rocprofiler_callback_trace_operation_args_cb_t function pointer typedef
- add rocprofiler_iterate_callback_trace_operation_args function
- fixed inconsistent use of *_trace_* vs. *_tracing_* (opting for tracing)
- removed rocprofiler_query_callback_trace_kind_name
- removed rocprofiler_query_callback_kind_operation_name
- Add include/rocprofiler/registration.h
  - header dedicated to registering a tool/client with rocprofiler
  - this header is not intended to be included by rocprofiler.h
  - rocprofiler_client_id_t
    - identifier for client tool
  - rocprofiler_client_finalize_t
    - function pointer prototype for tool-initiated finalization
  - rocprofiler_tool_initialize_t
    - function pointer prototype for tool initialization (i.e. configuration)
  - rocprofiler_tool_finalize_t
    - function pointer prototype for tool finalization
  - rocprofiler_tool_configure_result_t
    - struct returned by tool/client to rocprofiler
  - rocprofiler_is_initialized
    - function for querying whether tool-induced initialization is possible
  - rocprofiler_is_finalized
    - function for querying whether rocprofiler has been finalized
  - rocprofiler_configure prototype
    - this is the function tools implement
    - prototype is always marked as having default visibility
    - no implementation in rocprofiler
  - added typedef for rocprofiler_configure function pointer
  - added rocprofiler_force_configure to explicitly invoke rocprofiler_configure instead of relying on lazy init
- made callback typedef names more consistent (_cb_t suffix)
- typedef for rocprofiler_internal_thread_library_cb_t function pointer
- added rocprofiler_at_internal_thread_create function
- added rocprofiler_callback_thread_t struct
- added rocprofiler_create_callback_thread function
- added rocprofiler_assign_callback_thread function
- removed rocprofiler_buffer_tracing_record_header_t in favor of kind and correlation id in each record type
- added rocprofiler_buffer_tracing_kind_name_cb_t typedef
- added rocprofiler_buffer_tracing_operation_name_cb_t typedef
- added rocprofiler_iterate_buffer_tracing_kind_names function
- added rocprofiler_iterate_buffer_tracing_kind_operation_names function
- removed rocprofiler_query_buffer_trace_kind_name function
- removed rocprofiler_query_buffer_kind_operation_name function

* Update lib/common/container/stable_vector.hpp

- include limits header
- reserve_size struct
- overload stable_vector constructor to support reserving as part of construction

* Update lib/common/container/record_header_buffer.{hpp,cpp}

- add emplace member function accepting category and kind (two u32 variables) instead of one u64 kind
- use std::shared_mutex to prevent data-race when reading m_headers
- record_header_buffer is now multiple writer, single reader
- add read_lock member function (shared)
- add read_unlock member function (shared)
- lock member function gets exclusive lock
- unlock member function releases exclusive lock

* Rename "config" to "context" + restructure + implement

- Restructure config files + license
  - move config files into lib/rocprofiler/config subfolder
  - rename some files
  - add license to some files which were missing it
- Rename config/helpers.hpp
  - rename to allocator.hpp
  - remove get_domain_max_ops
- Create config/domain.{hpp,cpp}
  - structures for handling tracing domains and ops
- Update config/config.{hpp,cpp}
  - buffer_instance struct
  - callback_tracing_service struct
  - buffer_tracing_service struct
  - config struct
  - allocate_{config,buffer} func
  - {validate,start,stop}_config funcs
  - get_registered_configs func
  - get_active_configs func
  - get_buffers func
- Update rocprofiler.cpp
  - Implement rocprofiler_create_context
  - Implement rocprofiler_start_context
  - Implement rocprofiler_stop_context
  - Implement rocprofiler_context_is_active
  - Implement rocprofiler_context_is_valid
  - Implement rocprofiler_flush_buffer
  - Implement rocprofiler_destroy_buffer
  - Implement rocprofiler_create_buffer
- Update lib/rocprofiler/hsa
  - use rocprofiler_tracer_activity_domain_t instead of rocprofiler_tracer_activity_domain_t
  - remove ROCPROFILER_TRACER_ACTIVITY_DOMAIN_HSA_API fromHSA_API_INFO_DEFINITION_* macros
- Update lib/rocprofiler/context/domain.*
  - fixes for domain_info (i.e. use correct enums)
  - update rocprofiler_status_t codes
  - fix template instantiations
- Update lib/rocprofiler/context/context.*
  - use rocprofiler_service_callback_tracing_kind_t instead of rocprofiler_tracer_activity_domain_t
  - rename correlation_context to correlation_tracing_service
  - fix domains in callback_tracing_service and buffer_tracing_service
  - unique_ptr for callback_tracer and buffered_tracer in context
- Update lib/rocprofiler/rocprofiler.cpp
  - implement rocprofiler_configure_callback_tracing_service
- Update lib/rocprofiler/hsa/ostream.hpp
  - include rocprofiler.h instead of tracer.hpp
- Update lib/rocprofiler/hsa
  - migration to use rocprofiler_hsa_api_callback_tracer_data_t instead of rocprofiler_hsa_trace_data_t
  - restructure hsa_api_impl<Idx>
    - remove phase_enter and phase_exit
    - add set_data_args (partial replacement for phase_enter)
    - functor handles the contexts
- Update lib/rocprofiler/rocprofiler.cpp
  - implement rocprofiler_get_version
- Update lib/rocprofiler/hsa/hsa.{hpp,cpp}
  - remove hsa_api_ prefix for functions already in hsa namespace
- Update lib/rocprofiler/context/context.{hpp,cpp}
  - add client_idx to context struct (tool identifier)
  - add push_client function to set client_idx before context is allocated
  - add pop_client function to remove client identifier from future context creations
  - implemented {registered,active}_contexts and buffers to use new container::reserve_size overload to stable_vector
  - fix implementation of start_context
  - fix implementation of stop_context
- Update lib/rocprofiler/rocprofiler.cpp
  - prevent context creation, buffer creation, pc sampling config, etc. after initialization
  - add nullptr checks to rocprofiler_context_is_valid
  - fix rocprofiler_configure_callback_tracing_service
    - was checking size of buffers, not registered context
  - implement rocprofiler_iterate_callback_trace_kind_names
  - implement rocprofiler_iterate_callback_trace_kind_operation_names
- Update lib/rocprofiler/CMakeLists.txt
  - add registration.{hpp,cpp} to rocprofiler-library target sources
- Update lib/rocprofiler/hsa/utils.hpp
  - fix using fmt::formt with const char* strings
  - remove join functions (no longer used)
- Update lib/rocprofiler/hsa/hsa.{hpp,cpp}
  - remove args_string function
  - remove named_args_string function
  - update iterate_args function
    - change callback type
    - accept user data
  - rework the hsa_api_impl<Idx>::functor function
    - save the rocprofiler_callback_tracing_record_t between callbacks
  - update update_table function
    - check buffered_tracer domains
  - remove comments
- Update lib/rocprofiler/hsa/defines.hpp
  - remove MEMBER_<N> macros
  - add ADDR_MEMBER_<N> macros
  - remove doxygen comments for GET_MEMBER_FIELDS
  - add GET_ADDR_MEMBER_FIELDS
  - update HSA_API_INFO_DEFINITION_{0,V}
    - rename domain_idx to callback_domain_idx
    - add buffered_domain_idx
    - add as_arg_addr function
- Update lib/rocprofiler/rocprofiler.cpp
  - implement rocprofiler_iterate_callback_trace_operation_args
- Remove lib/rocprofiler/tracing.{hpp,cpp} and lib/rocprofiler/CMakeLists.txt
  - unused
- Update lib/rocprofiler/hsa/hsa.{hpp,cpp}
  - support buffered tracing in hsa_api_impl<Idx>::functor
  - rocprofiler_callback_trace_operation_args_cb_t -> rocprofiler_callback_tracing_operation_args_cb_t
    - i.e. trace -> tracing
- Update lib/rocprofiler/context/context.{hpp,cpp}
  - removed buffer_instance struct
  - removed allocate_buffer function
  - removed get_buffers function
  - changed buffer_tracing_service::buffer_array_t
- Update lib/rocprofiler/hsa: hsa.cpp, ostream.hpp, details folder
  - move ostream.hpp into details folder to prevent from contributing to code coverage
  - update cmake build system for new directory

* Add lib/rocprofiler/registration.{hpp,cpp}

- implements rocprofiler_set_api_table (called by rocprofiler-register)
- miscellaneous functions for client configure/initialize/finalize
- functions for querying the init/fini status
- relocated OnLoad HSA workaround to this file
  - at present, this is used to workaround ROCr not having rocprofiler-register integration yet
- implement rocprofiler_force_configure function
- implement rocprofiler_is_initialized function
- implement rocprofiler_is_finalized function
- ensure configure functions only invoked once
- ensure internal thread creation notification functions are invoked
- get_status is pair of atomics
- fix heap-use-after-free in init_logging
- update finalize
  - invoke hsa_shut_down
  - set all active contexts to null pointers

* Add lib/rocprofiler/buffer_tracing.cpp

- contains implementations of buffer_tracing (i.e. rocprofiler/buffer_tracing.h)
- previous implementation may have been moved out of lib/rocprofiler/rocprofiler.cpp

* Add lib/rocprofiler/buffer.{hpp,cpp}

- contains implementations of buffer (i.e. rocprofiler/buffer.h) and misc internal access functions
- previous implementation may have been moved out of lib/rocprofiler/rocprofiler.cpp and lib/rocprofiler/context/context.{hpp,cpp}

* Add lib/rocprofiler/callback_tracing.cpp

- contains implementations of callback_tracing (i.e. rocprofiler/callback_tracing.h)
- previous implementation may have been moved out of lib/rocprofiler/rocprofiler.cpp

* Add lib/rocprofiler/context.cpp

- contains implementations of context public API functions (i.e. rocprofiler/context.h)
- previous implementation may have been moved out of lib/rocprofiler/rocprofiler.cpp

* Add lib/rocprofiler/internal_threading.{hpp,cpp}

- contains implementations of internal_threading (i.e. rocprofiler/internal_threading.h)
- also contains implementations of internal access functions
- update finalize function
  - join all task groups and destroy all thread pools first, then reset unique_ptr

* Update lib/rocprofiler/rocprofiler.cpp

- rocprofiler_get_version returns status
- implement rocprofiler_get_timestamp
- remove misc implementations that were split into other files

* Update lib/rocprofiler/CMakeLists.txt

- compile new implementation files
  - buffer.cpp
  - buffer_tracing.cpp
  - callback_tracing.cpp
  - context.cpp
  - internal_threading.cpp

* Update lib/tests/buffering/buffering-*.cpp

- update to reflect changes to rocprofiler_record_header_t

* Update CMakeLists.txt

- increase minimum cmake version to 3.21 which added HIP support as a language

* Add samples/apps/transpose

- simple HIP application for testing

* Add samples/api_callback_tracing

- HIP application and tool library
- This effectively demos how to setup HSA API tracing
  - For each function called in tool, it stores the func/file/line and prints it during finalization
- client.hpp and client.cpp are the tool library
- Implement use of rocprofiler_iterate_callback_trace_operation_args
- add demo of using rocprofiler_get_version
- add_test
  - remove PASS_REGULAR_EXPRESSION
    - causing false passes during memcheck
  - add ROCPROFILER_MEMCHECK_PRELOAD_ENV to environment
- check if rocprofiler is initialized before stopping context

* Add samples/api_buffered_tracing

- Sample demonstrating tracing the HSA API via buffering
- demo rocprofiler_record_header_compute_hash
- throw exceptions for unexpected buffer data
- add_test
  - remove PASS_REGULAR_EXPRESSION
    - causing false passes during memcheck
  - add ROCPROFILER_MEMCHECK_PRELOAD_ENV to environment

* Update samples/CMakeLists.txt

- add subdirectory for api_callback_tracing
- add subdirectory api_buffered_tracing

* Update samples/pc_sampling/common.h

- fix processing of headers

* Update lib/rocprofiler/hsa/details/ostream.hpp

- fix data race on HSA_depth_max_cnt and recursion
- HSA_depth_max_cnt and recursion is now thread-local static instead of global static
- replace std::string usage with std::string_view

* Actions update

- add dependabot.yml
- use actions/checkout@v4
- install latest libasan and libtsan in sanitizer containers

* Add PTL (Parallel Tasking Library) submodule

[ROCm/rocprofiler-sdk commit: d3eaacd610]
2023-09-20 19:32:02 -05:00

245 wiersze
8.6 KiB
C++

/*
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "client.hpp"
#include "hip/hip_runtime.h"
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <random>
#include <stdexcept>
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
throw std::runtime_error("hip_api_call"); \
} \
}
namespace
{
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
size_t nthreads = 2;
size_t nitr = 500;
size_t nsync = 10;
constexpr unsigned shared_mem_tile_dim = 32;
void
check_hip_error(void);
void
verify(int* in, int* out, int M, int N);
} // namespace
__global__ void
transpose_a(int* in, int* out, int M, int N);
void
run(int rank, int tid, hipStream_t stream, int argc, char** argv);
int
main(int argc, char** argv)
{
client::setup(); // forces rocprofiler to configure/initialize
client::start(); // starts context before any API tables are available
int rank = 0;
int size = 1;
for(int i = 1; i < argc; ++i)
{
auto _arg = std::string{argv[i]};
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
fprintf(stderr,
"usage: transpose [NUM_THREADS (%zu)] [NUM_ITERATION (%zu)] "
"[SYNC_EVERY_N_ITERATIONS (%zu)]\n",
nthreads,
nitr,
nsync);
exit(EXIT_SUCCESS);
}
}
if(argc > 1) nthreads = atoll(argv[1]);
if(argc > 2) nitr = atoll(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
printf("[transpose] Number of threads: %zu\n", nthreads);
printf("[transpose] Number of iterations: %zu\n", nitr);
printf("[transpose] Syncing every %zu iterations\n", nsync);
// this is a temporary workaround in omnitrace when HIP + MPI is enabled
int ndevice = 0;
int devid = rank;
HIP_API_CALL(hipGetDeviceCount(&ndevice));
printf("[transpose] Number of devices found: %i\n", ndevice);
if(ndevice > 0)
{
devid = rank % ndevice;
HIP_API_CALL(hipSetDevice(devid));
printf("[transpose] Rank %i assigned to device %i\n", rank, devid);
}
if(rank == devid && rank < ndevice)
{
std::vector<std::thread> _threads{};
std::vector<hipStream_t> _streams(nthreads);
for(size_t i = 0; i < nthreads; ++i)
HIP_API_CALL(hipStreamCreate(&_streams.at(i)));
for(size_t i = 1; i < nthreads; ++i)
_threads.emplace_back(run, rank, i, _streams.at(i), argc, argv);
run(rank, 0, _streams.at(0), argc, argv);
for(auto& itr : _threads)
itr.join();
for(size_t i = 0; i < nthreads; ++i)
HIP_API_CALL(hipStreamDestroy(_streams.at(i)));
}
HIP_API_CALL(hipDeviceSynchronize());
HIP_API_CALL(hipDeviceReset());
client::stop();
client::shutdown();
return 0;
}
__global__ void
transpose_a(int* in, int* out, int M, int N)
{
__shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim];
int idx = (blockIdx.y * blockDim.y + threadIdx.y) * M + blockIdx.x * blockDim.x + threadIdx.x;
tile[threadIdx.y][threadIdx.x] = in[idx];
__syncthreads();
idx = (blockIdx.x * blockDim.x + threadIdx.y) * N + blockIdx.y * blockDim.y + threadIdx.x;
out[idx] = tile[threadIdx.x][threadIdx.y];
}
void
run(int rank, int tid, hipStream_t stream, int argc, char** argv)
{
unsigned int M = 4960 * 2;
unsigned int N = 4960 * 2;
if(argc > 2) nitr = atoll(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{print_lock};
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
std::uniform_int_distribution<int> _dist{0, 1000};
size_t size = sizeof(int) * M * N;
int* inp_matrix = new int[size];
int* out_matrix = new int[size];
for(size_t i = 0; i < M * N; i++)
{
inp_matrix[i] = _dist(_engine);
out_matrix[i] = 0;
}
int* in = nullptr;
int* out = nullptr;
HIP_API_CALL(hipMalloc(&in, size));
HIP_API_CALL(hipMalloc(&out, size));
HIP_API_CALL(hipMemsetAsync(in, 0, size, stream));
HIP_API_CALL(hipMemsetAsync(out, 0, size, stream));
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream));
HIP_API_CALL(hipStreamSynchronize(stream));
dim3 grid(M / 32, N / 32, 1);
dim3 block(32, 32, 1); // transpose_a
auto t1 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < nitr; ++i)
{
transpose_a<<<grid, block, 0, stream>>>(in, out, M, N);
check_hip_error();
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
}
auto t2 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream));
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
HIP_API_CALL(hipStreamSynchronize(stream));
// cpu_transpose(matrix, out_matrix, M, N);
verify(inp_matrix, out_matrix, M, N);
HIP_API_CALL(hipFree(in));
HIP_API_CALL(hipFree(out));
delete[] inp_matrix;
delete[] out_matrix;
}
namespace
{
void
check_hip_error(void)
{
hipError_t err = hipGetLastError();
if(err != hipSuccess)
{
auto_lock_t _lk{print_lock};
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
throw std::runtime_error("hip_api_call");
}
}
void
verify(int* in, int* out, int M, int N)
{
for(int i = 0; i < 10; i++)
{
int row = rand() % M;
int col = rand() % N;
if(in[row * N + col] != out[col * M + row])
{
auto_lock_t _lk{print_lock};
std::cout << "mismatch: " << row << ", " << col << " : " << in[row * N + col] << " | "
<< out[col * M + row] << "\n";
}
}
}
} // namespace