From 407fc57edef56dcd279df70105bebe80ef207b1a Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 7 Mar 2024 22:21:26 -0600 Subject: [PATCH] Shared Library Constructor (rocprofv3 deadlock fix) (#599) * Moved tests/apps to tests/bin * Renamed cmake project in tests/bin * Update samples - Use ROCPROFILER_DEFAULT_FAIL_REGEX - tweaks to stdout messages * Update tests - Use ROCPROFILER_DEFAULT_FAIL_REGEX * Add tests/lib - libraries with HIP code * Update PTL submodule - remove atexit delete of thread_id_map * Update cmake/rocprofiler_options.cmake - Set ROCPROFILER_DEFAULT_FAIL_REGEX * Update common lib: env + logging - improved customization of logging settings - default to disabling logging to files - install failure handler for rocprofv3 - set_env support in environment.* * Add lib/rocprofiler-sdk/shared_library.cpp - shared library constructor * Update lib/rocprofiler-sdk-tool/tool.cpp - destructor thread safety - convert callback_name_info and buffered_name_info to pointers - install failure handler for logging * Add tests/bin/hip-in-libraries - hip-in-libraries is an exe which uses two shared libraries where each shared library contains HIP kernels - used for testing deadlocking within __hipRegisterFatBinary * Update bin/rocprofv3 - reorganized the env variables - use exec to launch command - set ROCPROFILER_LIBRARY_CTOR=1 * Add tests/rocprofv3/tracing-hip-in-libraries - uses hip-in-libraries exe for exe which uses shared libraries to launch HIP kernels * Update bin/rocprofv3 - fix counter collection (no exec) * Update lib/rocprofiler-sdk-tool/tool.cpp - replace "Kernel-Name" with "Kernel_Name" * Update lib/rocprofiler-sdk/registration.cpp Use RTLD_LOCAL instead of RTLD_GLOBAL for env libraries * Update tests/rocprofv3 - replace "Kernel-Name" with "Kernel_Name" * Update tests - vector-ops (bin) stream syncs + runs with 4 queues per device - improve counter-collection/input1 validation - rocprofv3/tracing-hip-in-libraries does not do sys-trace - improved validation script for tracing-hip-in-libraries - updated dispatch_callback in json-tool.cpp following reworking of prototypes for counter collection * Update samples/counter_collection - updated dispatch_callback(s) and record_callback(s) following reworking of prototypes * Update bin/rocprofv3 - reorganized help menu - added options for sub-HSA tables - added --hip-runtime-trace - changed --hip-trace to include --hip-compiler-trace * Update lib/rocprofiler-sdk-tool - improved kernel filtering - removed arch_vgpr, accum_vgpr, sgpr code (in rocprofiler-sdk) - fixed issue with counter-collection w/o tracing - added support for fine grained HSA API tracing - removed directly linking to HSA-runtime * Update lib/rocprofiler-sdk/agent.cpp - rocp_agents != hsa_agents is non-fatal when ROCPROFILER_BUILD_CI=OFF (CMake option) * GPR (vector and scalar) info in kernel symbol data - rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t contains general purpose register info * Header include order fix - Include repo headers first - Third party library headers next - standard library headers last * Update dispatch profiling public API - introduce rocprofiler_profile_counting_dispatch_data_t - change signature of rocprofiler_profile_counting_dispatch_callback_t and rocprofiler_profile_counting_record_callback_t - provide rocprofiler_user_data_t pointer in dispatch callback - provide rocprofiler_user_data_t value (from dispatch cb) in record callback * Update tests/bin/CMakeLists.txt - fix add_subdirectory(hip-in-libraries) order * Update VERSION - bump to 0.2.0 in prep for AFAR [ROCm/rocprofiler-sdk commit: 7b6d3c70bdcfb1d602ed42824bb7325fa668fe8f] --- projects/rocprofiler-sdk/VERSION | 2 +- .../cmake/rocprofiler_options.cmake | 5 + projects/rocprofiler-sdk/external/ptl | 2 +- .../api_buffered_tracing/CMakeLists.txt | 2 +- .../samples/api_buffered_tracing/main.cpp | 10 +- .../api_callback_tracing/CMakeLists.txt | 2 +- .../samples/api_callback_tracing/main.cpp | 8 +- .../code_object_tracing/CMakeLists.txt | 2 +- .../samples/code_object_tracing/main.cpp | 10 +- .../samples/common/CMakeLists.txt | 5 + .../samples/counter_collection/CMakeLists.txt | 6 +- .../counter_collection/callback_client.cpp | 39 +- .../samples/counter_collection/client.cpp | 25 +- .../print_functional_counters.cpp | 64 ++- .../samples/intercept_table/CMakeLists.txt | 2 +- .../samples/intercept_table/main.cpp | 2 +- projects/rocprofiler-sdk/source/bin/rocprofv3 | 123 +++-- .../rocprofiler-sdk/callback_tracing.h | 11 +- .../rocprofiler-sdk/dispatch_profile.h | 79 +-- .../source/include/rocprofiler-sdk/fwd.h | 14 +- .../source/lib/common/environment.cpp | 25 +- .../source/lib/common/environment.hpp | 14 + .../source/lib/common/logging.cpp | 58 ++- .../source/lib/common/logging.hpp | 18 +- .../lib/rocprofiler-sdk-tool/CMakeLists.txt | 1 - .../lib/rocprofiler-sdk-tool/config.hpp | 29 +- .../lib/rocprofiler-sdk-tool/helper.cpp | 195 +------- .../lib/rocprofiler-sdk-tool/helper.hpp | 63 --- .../source/lib/rocprofiler-sdk-tool/tool.cpp | 460 +++++++++++------- .../source/lib/rocprofiler-sdk/CMakeLists.txt | 5 +- .../source/lib/rocprofiler-sdk/agent.cpp | 10 +- .../lib/rocprofiler-sdk/counters/core.cpp | 62 ++- .../lib/rocprofiler-sdk/counters/core.hpp | 3 +- .../rocprofiler-sdk/counters/tests/core.cpp | 153 +++--- .../counters/tests/dimension.cpp | 18 +- .../counters/tests/init_order.cpp | 34 +- .../lib/rocprofiler-sdk/hsa/code_object.cpp | 184 ++++++- .../source/lib/rocprofiler-sdk/hsa/queue.cpp | 10 +- .../source/lib/rocprofiler-sdk/hsa/queue.hpp | 4 +- .../lib/rocprofiler-sdk/profile_config.cpp | 5 +- .../lib/rocprofiler-sdk/registration.cpp | 7 +- .../lib/rocprofiler-sdk/shared_library.cpp | 84 ++++ .../lib/rocprofiler-sdk/tests/common.hpp | 2 +- .../lib/rocprofiler-sdk/tests/roctx.cpp | 12 +- projects/rocprofiler-sdk/tests/CMakeLists.txt | 5 +- .../tests/async-copy-tracing/CMakeLists.txt | 5 +- .../tests/{apps => bin}/CMakeLists.txt | 3 +- .../tests/bin/hip-in-libraries/CMakeLists.txt | 34 ++ .../bin/hip-in-libraries/hip-in-libraries.cpp | 151 ++++++ .../{apps => bin}/multistream/CMakeLists.txt | 2 +- .../multistream/multistream_app.cpp | 0 .../reproducible-runtime/CMakeLists.txt | 2 +- .../reproducible-runtime.cpp | 0 .../simple-transpose/CMakeLists.txt | 2 +- .../simple-transpose/simple-transpose.cpp | 0 .../{apps => bin}/transpose/CMakeLists.txt | 2 +- .../{apps => bin}/transpose/transpose.cpp | 8 +- .../vector-operations/CMakeLists.txt | 2 +- .../vector-operations/vector-ops.cpp | 114 +++-- .../tests/c-tool/CMakeLists.txt | 2 +- .../tests/common/CMakeLists.txt | 5 + .../tests/counter-collection/CMakeLists.txt | 4 +- .../tests/kernel-tracing/CMakeLists.txt | 4 +- .../rocprofiler-sdk/tests/lib/CMakeLists.txt | 18 + .../tests/lib/transpose/CMakeLists.txt | 61 +++ .../tests/lib/transpose/transpose.cpp | 260 ++++++++++ .../tests/lib/transpose/transpose.hpp | 28 ++ .../lib/vector-operations/CMakeLists.txt | 49 ++ .../lib/vector-operations/vector-ops.cpp | 291 +++++++++++ .../lib/vector-operations/vector-ops.hpp | 26 + .../tests/rocprofv3/CMakeLists.txt | 3 +- .../counter-collection/CMakeLists.txt | 4 + .../counter-collection/input1/CMakeLists.txt | 4 +- .../counter-collection/input1/validate.py | 44 +- .../counter-collection/input2/CMakeLists.txt | 4 +- .../counter-collection/input2/validate.py | 2 +- .../list_metrics/CMakeLists.txt | 4 +- .../rocprofv3/counter-collection/pytest.ini | 1 + .../tracing-hip-in-libraries/CMakeLists.txt | 72 +++ .../tracing-hip-in-libraries/conftest.py | 94 ++++ .../tracing-hip-in-libraries/validate.py | 142 ++++++ .../rocprofv3/tracing-plus-cc/CMakeLists.txt | 4 +- .../rocprofv3/tracing-plus-cc/validate.py | 2 +- .../tests/rocprofv3/tracing/validate.py | 2 +- .../rocprofiler-sdk/tests/tools/json-tool.cpp | 24 +- 85 files changed, 2497 insertions(+), 856 deletions(-) create mode 100644 projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/shared_library.cpp rename projects/rocprofiler-sdk/tests/{apps => bin}/CMakeLists.txt (88%) create mode 100644 projects/rocprofiler-sdk/tests/bin/hip-in-libraries/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/bin/hip-in-libraries/hip-in-libraries.cpp rename projects/rocprofiler-sdk/tests/{apps => bin}/multistream/CMakeLists.txt (95%) rename projects/rocprofiler-sdk/tests/{apps => bin}/multistream/multistream_app.cpp (100%) rename projects/rocprofiler-sdk/tests/{apps => bin}/reproducible-runtime/CMakeLists.txt (96%) rename projects/rocprofiler-sdk/tests/{apps => bin}/reproducible-runtime/reproducible-runtime.cpp (100%) rename projects/rocprofiler-sdk/tests/{apps => bin}/simple-transpose/CMakeLists.txt (95%) rename projects/rocprofiler-sdk/tests/{apps => bin}/simple-transpose/simple-transpose.cpp (100%) rename projects/rocprofiler-sdk/tests/{apps => bin}/transpose/CMakeLists.txt (96%) rename projects/rocprofiler-sdk/tests/{apps => bin}/transpose/transpose.cpp (96%) rename projects/rocprofiler-sdk/tests/{apps => bin}/vector-operations/CMakeLists.txt (94%) rename projects/rocprofiler-sdk/tests/{apps => bin}/vector-operations/vector-ops.cpp (75%) create mode 100644 projects/rocprofiler-sdk/tests/lib/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/lib/transpose/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/lib/transpose/transpose.cpp create mode 100644 projects/rocprofiler-sdk/tests/lib/transpose/transpose.hpp create mode 100644 projects/rocprofiler-sdk/tests/lib/vector-operations/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.cpp create mode 100644 projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.hpp create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/conftest.py create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/validate.py diff --git a/projects/rocprofiler-sdk/VERSION b/projects/rocprofiler-sdk/VERSION index 6e8bf73aa5..0ea3a944b3 100644 --- a/projects/rocprofiler-sdk/VERSION +++ b/projects/rocprofiler-sdk/VERSION @@ -1 +1 @@ -0.1.0 +0.2.0 diff --git a/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake b/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake index fe3b073b57..9a08845052 100644 --- a/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake +++ b/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake @@ -131,3 +131,8 @@ if(ASAN) endif() include(rocprofiler_memcheck) + +# default FAIL_REGULAR_EXPRESSION for tests +set(ROCPROFILER_DEFAULT_FAIL_REGEX + "threw an exception|Permission denied|Could not create logging file" + CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests") diff --git a/projects/rocprofiler-sdk/external/ptl b/projects/rocprofiler-sdk/external/ptl index 12ca26ac2b..48df416254 160000 --- a/projects/rocprofiler-sdk/external/ptl +++ b/projects/rocprofiler-sdk/external/ptl @@ -1 +1 @@ -Subproject commit 12ca26ac2b3091c8dca8e65df73b4dca8b43ce6a +Subproject commit 48df41625430d27ce43cf197fd467a8dda87cb45 diff --git a/projects/rocprofiler-sdk/samples/api_buffered_tracing/CMakeLists.txt b/projects/rocprofiler-sdk/samples/api_buffered_tracing/CMakeLists.txt index e3cf49a342..ab6a8034af 100644 --- a/projects/rocprofiler-sdk/samples/api_buffered_tracing/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/api_buffered_tracing/CMakeLists.txt @@ -55,4 +55,4 @@ set_tests_properties( ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp b/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp index 1c7e73cebb..c87ccff4d7 100644 --- a/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp +++ b/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp @@ -155,7 +155,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) if(argc > 3) nsync = atoll(argv[3]); auto_lock_t _lk{print_lock}; - std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; _lk.unlock(); std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)}; @@ -183,7 +183,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) dim3 block(32, 32, 1); // transpose_a print_lock.lock(); - printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n", + printf("[transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n", rank, tid, grid.x, @@ -208,8 +208,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) float GB = (float) size * nitr * 2 / (1 << 30); print_lock.lock(); - std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n" - << "The average performance of transpose is " << GB / time << " GBytes/sec" + std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time + << " sec\n"; + std::cout << "[transpose][" << rank << "][" << tid + << "] The average performance of transpose is " << GB / time << " GBytes/sec" << std::endl; print_lock.unlock(); diff --git a/projects/rocprofiler-sdk/samples/api_callback_tracing/CMakeLists.txt b/projects/rocprofiler-sdk/samples/api_callback_tracing/CMakeLists.txt index 65c652e2f7..9b5c77ba95 100644 --- a/projects/rocprofiler-sdk/samples/api_callback_tracing/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/api_callback_tracing/CMakeLists.txt @@ -57,4 +57,4 @@ set(callback-api-tracing-env set_tests_properties( callback-api-tracing PROPERTIES TIMEOUT 45 LABELS "samples" ENVIRONMENT "${callback-api-tracing-env}" - FAIL_REGULAR_EXPRESSION "threw an exception") + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/samples/api_callback_tracing/main.cpp b/projects/rocprofiler-sdk/samples/api_callback_tracing/main.cpp index 99d385263c..9751463298 100644 --- a/projects/rocprofiler-sdk/samples/api_callback_tracing/main.cpp +++ b/projects/rocprofiler-sdk/samples/api_callback_tracing/main.cpp @@ -177,7 +177,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) if(argc > 3) nsync = atoll(argv[3]); auto_lock_t _lk{print_lock}; - std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; _lk.unlock(); auto _seed = std::random_device{}() * (rank + 1) * (tid + 1); @@ -219,8 +219,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) float GB = (float) size * nitr * 2 / (1 << 30); print_lock.lock(); - std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n" - << "The average performance of transpose is " << GB / time << " GBytes/sec" + std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time + << " sec\n"; + std::cout << "[transpose][" << rank << "][" << tid + << "] The average performance of transpose is " << GB / time << " GBytes/sec" << std::endl; print_lock.unlock(); diff --git a/projects/rocprofiler-sdk/samples/code_object_tracing/CMakeLists.txt b/projects/rocprofiler-sdk/samples/code_object_tracing/CMakeLists.txt index 5561feee93..2f41af72b1 100644 --- a/projects/rocprofiler-sdk/samples/code_object_tracing/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/code_object_tracing/CMakeLists.txt @@ -55,4 +55,4 @@ set_tests_properties( ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/samples/code_object_tracing/main.cpp b/projects/rocprofiler-sdk/samples/code_object_tracing/main.cpp index 3687c999a9..124ec5c149 100644 --- a/projects/rocprofiler-sdk/samples/code_object_tracing/main.cpp +++ b/projects/rocprofiler-sdk/samples/code_object_tracing/main.cpp @@ -145,7 +145,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) if(argc > 3) nsync = atoll(argv[3]); auto_lock_t _lk{print_lock}; - std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; _lk.unlock(); std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)}; @@ -173,7 +173,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) dim3 block(32, 32, 1); // transpose_a print_lock.lock(); - printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n", + printf("[transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n", rank, tid, grid.x, @@ -198,8 +198,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) float GB = (float) size * nitr * 2 / (1 << 30); print_lock.lock(); - std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n" - << "The average performance of transpose is " << GB / time << " GBytes/sec" + std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time + << " sec\n"; + std::cout << "[transpose][" << rank << "][" << tid + << "] The average performance of transpose is " << GB / time << " GBytes/sec" << std::endl; print_lock.unlock(); diff --git a/projects/rocprofiler-sdk/samples/common/CMakeLists.txt b/projects/rocprofiler-sdk/samples/common/CMakeLists.txt index 8369b6daab..edae86f2e6 100644 --- a/projects/rocprofiler-sdk/samples/common/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/common/CMakeLists.txt @@ -2,6 +2,11 @@ # common utilities for samples # +# default FAIL_REGULAR_EXPRESSION for tests +set(ROCPROFILER_DEFAULT_FAIL_REGEX + "threw an exception|Permission denied|Could not create logging file" + CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests") + # build flags add_library(rocprofiler-samples-build-flags INTERFACE) add_library(rocprofiler::samples-build-flags ALIAS rocprofiler-samples-build-flags) diff --git a/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt b/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt index 45f3909bb3..91a9053ebe 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/counter_collection/CMakeLists.txt @@ -52,7 +52,7 @@ set_tests_properties( ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") add_library(counter-collection-callback-client SHARED) target_sources(counter-collection-callback-client PRIVATE callback_client.cpp client.hpp) @@ -80,7 +80,7 @@ set_tests_properties( ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") add_library(counter-collection-functional-counter-client SHARED) target_sources(counter-collection-functional-counter-client @@ -109,4 +109,4 @@ set_tests_properties( ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/samples/counter_collection/callback_client.cpp b/projects/rocprofiler-sdk/samples/counter_collection/callback_client.cpp index 82ad66c547..e86d90ac4d 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/callback_client.cpp +++ b/projects/rocprofiler-sdk/samples/counter_collection/callback_client.cpp @@ -67,15 +67,14 @@ get_client_ctx() } void -record_callback(rocprofiler_queue_id_t, - rocprofiler_agent_id_t, - rocprofiler_correlation_id_t, - uint64_t, - void* callback_data_args, - size_t record_count, - rocprofiler_record_counter_t* record_data) +record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_record_counter_t* record_data, + size_t record_count, + rocprofiler_user_data_t user_data, + void* callback_data_args) { std::stringstream ss; + ss << "Kernel_id " << dispatch_data.kernel_id << ": "; for(size_t i = 0; i < record_count; ++i) { ss << "(Id: " << record_data[i].id << " Value [D]: " << record_data[i].counter_value @@ -84,6 +83,8 @@ record_callback(rocprofiler_queue_id_t, auto* output_stream = static_cast(callback_data_args); if(!output_stream) throw std::runtime_error{"nullptr to output stream"}; *output_stream << "[" << __FUNCTION__ << "] " << ss.str() << "\n"; + + (void) user_data; } /** @@ -93,13 +94,10 @@ record_callback(rocprofiler_queue_id_t, * to collect the counter SQ_WAVES for all kernel dispatch packets. */ void -dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t /*correlation_id*/, - const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/, - uint64_t /*kernel_id*/, - void* /*callback_data_args*/, - rocprofiler_profile_config_id_t* config) +dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, + rocprofiler_user_data_t* /*user_data*/, + void* /*callback_data_args*/) { /** * This simple example uses the same profile counter set for all agents. @@ -112,7 +110,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, static std::unordered_map profile_cache = {}; auto search_cache = [&]() { - if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end()) + if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end()) { *config = pos->second; return true; @@ -135,7 +133,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, // Iterate through the agents and get the counters available on that agent ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters( - agent->id, + dispatch_data.agent_id, [](rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, @@ -169,11 +167,12 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, // Create a colleciton profile for the counters rocprofiler_profile_config_id_t profile; - ROCPROFILER_CALL(rocprofiler_create_profile_config( - agent->id, collect_counters.data(), collect_counters.size(), &profile), - "Could not construct profile cfg"); + ROCPROFILER_CALL( + rocprofiler_create_profile_config( + dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile), + "Could not construct profile cfg"); - profile_cache.emplace(agent->id.handle, profile); + profile_cache.emplace(dispatch_data.agent_id.handle, profile); // Return the profile to collect those counters for this dispatch *config = profile; } diff --git a/projects/rocprofiler-sdk/samples/counter_collection/client.cpp b/projects/rocprofiler-sdk/samples/counter_collection/client.cpp index 8e82640887..970c295a1f 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/client.cpp +++ b/projects/rocprofiler-sdk/samples/counter_collection/client.cpp @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -116,13 +117,10 @@ buffered_callback(rocprofiler_context_id_t, * to collect the counter SQ_WAVES for all kernel dispatch packets. */ void -dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t /*correlation_id*/, - const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/, - uint64_t /*kernel_id*/, - void* /*callback_data_args*/, - rocprofiler_profile_config_id_t* config) +dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, + rocprofiler_user_data_t* /*user_data*/, + void* /*callback_data_args*/) { /** * This simple example uses the same profile counter set for all agents. @@ -135,7 +133,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, static std::unordered_map profile_cache = {}; auto search_cache = [&]() { - if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end()) + if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end()) { *config = pos->second; return true; @@ -158,7 +156,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, // Iterate through the agents and get the counters available on that agent ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters( - agent->id, + dispatch_data.agent_id, [](rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, @@ -192,11 +190,12 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, // Create a colleciton profile for the counters rocprofiler_profile_config_id_t profile; - ROCPROFILER_CALL(rocprofiler_create_profile_config( - agent->id, collect_counters.data(), collect_counters.size(), &profile), - "Could not construct profile cfg"); + ROCPROFILER_CALL( + rocprofiler_create_profile_config( + dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile), + "Could not construct profile cfg"); - profile_cache.emplace(agent->id.handle, profile); + profile_cache.emplace(dispatch_data.agent_id.handle, profile); // Return the profile to collect those counters for this dispatch *config = profile; } diff --git a/projects/rocprofiler-sdk/samples/counter_collection/print_functional_counters.cpp b/projects/rocprofiler-sdk/samples/counter_collection/print_functional_counters.cpp index 244177f304..872e53dadb 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/print_functional_counters.cpp +++ b/projects/rocprofiler-sdk/samples/counter_collection/print_functional_counters.cpp @@ -189,15 +189,46 @@ buffered_callback(rocprofiler_context_id_t, } } -void -dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t /*correlation_id*/, - const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/, - uint64_t /*kernel_id*/, - void* /*callback_data_args*/, - rocprofiler_profile_config_id_t* config) +using agent_map_t = std::map; + +agent_map_t +get_agent_info() { + auto iterate_cb = [](rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* user_data) { + if(agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) + throw std::runtime_error{"unexpected rocprofiler agent version"}; + + auto* agents_v = static_cast(user_data); + for(size_t i = 0; i < num_agents; ++i) + { + const auto* itr = static_cast(agents_arr[i]); + agents_v->emplace(itr->id.handle, itr); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + auto _agents = agent_map_t{}; + ROCPROFILER_CALL( + rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, + iterate_cb, + sizeof(rocprofiler_agent_t), + const_cast(static_cast(&_agents))), + "query available agents"); + + return _agents; +} + +void +dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, + rocprofiler_user_data_t* /*user_data*/, + void* /*callback_data_args*/) +{ + static auto agents = get_agent_info(); + auto& cap = *get_capture(); auto wlock = std::unique_lock{cap.m_mutex}; @@ -211,7 +242,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, { std::vector counters_needed; ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters( - agent->id, + dispatch_data.agent_id, [](rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, @@ -237,9 +268,9 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, "Could not query counter_id"); cap.expected_counter_names.emplace(found_counter.handle, std::string(version.name)); size_t expected = 0; - ROCPROFILER_CALL( - rocprofiler_query_counter_instance_count(agent->id, found_counter, &expected), - "COULD NOT QUERY INSTANCES"); + ROCPROFILER_CALL(rocprofiler_query_counter_instance_count( + dispatch_data.agent_id, found_counter, &expected), + "COULD NOT QUERY INSTANCES"); cap.remaining.push_back(found_counter); cap.expected.emplace(found_counter.handle, expected); @@ -266,7 +297,8 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, } if(cap.expected.empty()) { - std::clog << "No counters found for agent - " << agent->name; + std::clog << "No counters found for agent " << dispatch_data.agent_id.handle << " (" + << agents.at(dispatch_data.agent_id.handle)->name << ")"; } } if(cap.remaining.empty()) return; @@ -274,9 +306,9 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/, rocprofiler_profile_config_id_t profile; // Select the next counter to collect. - ROCPROFILER_CALL( - rocprofiler_create_profile_config(agent->id, &(cap.remaining.back()), 1, &profile), - "Could not construct profile cfg"); + ROCPROFILER_CALL(rocprofiler_create_profile_config( + dispatch_data.agent_id, &(cap.remaining.back()), 1, &profile), + "Could not construct profile cfg"); cap.remaining.pop_back(); *config = profile; diff --git a/projects/rocprofiler-sdk/samples/intercept_table/CMakeLists.txt b/projects/rocprofiler-sdk/samples/intercept_table/CMakeLists.txt index e4dd185672..4289ad56c0 100644 --- a/projects/rocprofiler-sdk/samples/intercept_table/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/intercept_table/CMakeLists.txt @@ -54,4 +54,4 @@ set_tests_properties( ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/samples/intercept_table/main.cpp b/projects/rocprofiler-sdk/samples/intercept_table/main.cpp index fc99283091..be8240b98a 100644 --- a/projects/rocprofiler-sdk/samples/intercept_table/main.cpp +++ b/projects/rocprofiler-sdk/samples/intercept_table/main.cpp @@ -145,7 +145,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) if(argc > 3) nsync = atoll(argv[3]); auto_lock_t _lk{print_lock}; - std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; _lk.unlock(); std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)}; diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3 b/projects/rocprofiler-sdk/source/bin/rocprofv3 index 6e36dfc0ee..fadf631d22 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3 +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3 @@ -5,9 +5,16 @@ set -eo pipefail ROCPROFV3_DIR=$(dirname -- "$(realpath "${BASH_SOURCE[0]}")") ROCM_DIR=$(dirname -- "${ROCPROFV3_DIR}") -: ${HSA_TOOLS_LIB:="${ROCM_DIR}/lib/librocprofiler-sdk.so"} +: ${ROCPROFILER_LIBRARY_CTOR:=1} +: ${ROCPROF_OUTPUT_PATH:="."} +: ${ROCPROF_OUTPUT_PATH_INTERNAL:="."} +: ${ROCPROF_OUTPUT_FILE_NAME:=""} +: ${ROCPROF_COUNTERS_PATH:=""} +: ${ROCPROF_PRELOAD:=""} +: ${ROCPROF_TOOL_LIBRARY:="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"} +: ${ROCPROF_SDK_LIBRARY:="${ROCM_DIR}/lib/librocprofiler-sdk.so"} -export HSA_TOOLS_LIB +export ROCPROFILER_LIBRARY_CTOR # Define color codes GREEN='\033[0;32m' @@ -19,27 +26,40 @@ usage() { if [ -z "${EC}" ]; then EC=1; fi echo -e "${RESET}ROCProfilerV3 Run Script Usage:" echo -e "${GREEN}-h | --help ${RESET} For showing this message" + echo -e "" + echo -e "${GREEN}--hip-trace ${RESET} For Collecting HIP Traces (runtime + compiler)" + echo -e "${GREEN}--hip-runtime-trace ${RESET} For Collecting HIP Runtime API Traces" + echo -e "${GREEN}--hip-compiler-trace ${RESET} For Collecting HIP Compiler generated code Traces" + echo -e "" + echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces" + echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces" + echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces" + echo -e "" + echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces (core + amd + image + finalizer)" + echo -e "${GREEN}--hsa-core-trace ${RESET} For Collecting HSA API Traces (core API)" + echo -e "${GREEN}--hsa-amd-trace ${RESET} For Collecting HSA API Traces (AMD-extension API)" + echo -e "${GREEN}--hsa-image-trace ${RESET} For Collecting HSA API Traces (Image-extenson API)" + echo -e "${GREEN}--hsa-finalizer-trace ${RESET} For Collecting HSA API Traces (Finalizer-extension API)" + echo -e "" + echo -e "${GREEN}--sys-trace ${RESET} For Collecting HIP,HSA, Memory Copy, (marker)ROCTx and Kernel dispatch traces\n" + echo -e "" + echo -e "${GREEN}-o | --output-file ${RESET} For the output file name" + echo -e "\t#${GREY} usage (with current dir): rocprofv3 --hsa-trace -o " + echo -e "\t#${GREY} usage (with custom dir): rocprofv3 --hsa-trace -d -o ${RESET}\n" + echo -e "" + echo -e "${GREEN}-d | --output-directory ${RESET} For adding output path where the output files will be saved" + echo -e "\t#${GREY} usage (with custom dir): rocprofv3 --hsa-trace -d ${RESET}" + echo -e "" + echo -e "${GREEN}-M | --mangled-kernels ${RESET} Do not demangle the kernel names" + echo -e "${GREEN}-T | --truncate-kernels ${RESET} Truncate the demangled kernel names" + echo -e "" + echo -e "${GREEN}-L | --list-metrics ${RESET} List metrics for counter collection" echo -e "${GREEN}-i | --input ${RESET} For counter collection " echo -e "\t#${GREY} Input file .txt format, automatically rerun application for every profiling features line" echo -e "\t# Perf counters group 1" echo -e "\tpmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts" echo -e "\t# Perf counters group 2" echo -e "\tpmc : WriteSize L2CacheHit ${RESET}" - echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces" - echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces" - echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces" - echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces" - echo -e "${GREEN}--hip-trace ${RESET} For Collecting HIP Runtime Traces" - echo -e "${GREEN}--hip-compiler-trace ${RESET} For Collecting HIP Compiler generated code Traces" - echo -e "${GREEN}--sys-trace ${RESET} For Collecting HIP,HSA, Memory Copy, (marker)ROCTx and Kernel dispatch traces\n" - echo -e "${GREEN}-o | --output-file ${RESET} For the output file name" - echo -e "\t#${GREY} usage e.g:(with current dir): rocprofv3 --hsa-trace -o " - echo -e "\t#${GREY} usage e.g:(with custom dir): rocprofv3 --hsa-trace -d -o ${RESET}\n" - echo -e "${GREEN}-d | --output-directory ${RESET} For adding output path where the output files will be saved" - echo -e "\t#${GREY} usage e.g:(with custom dir): rocprofv3 --hsa-trace -d ${RESET}" - echo -e "${GREEN}-M | --mangled-kernels ${RESET} Do not demangle the kernel names" - echo -e "${GREEN}-T | --truncate-kernels ${RESET} Truncate the demangled kernel names" - echo -e "${GREEN}-L | --list-metrics ${RESET} List metrics" echo -e "" exit ${EC} } @@ -48,11 +68,22 @@ if [ -z "$1" ]; then usage 1 fi -: ${ROCPROF_OUTPUT_PATH:="."} -: ${ROCPROF_OUTPUT_PATH_INTERNAL:="."} -: ${ROCPROF_OUTPUT_FILE_NAME:=""} -: ${ROCPROF_COUNTERS_PATH:=""} -: ${ROCPROF_PRELOAD:=""} +if [ -n "${ROCPROF_PRELOAD}" ]; then + ROCPROF_PRELOAD="${ROCPROF_PRELOAD}:${ROCPROF_TOOL_LIBRARY}:${ROCPROF_SDK_LIBRARY}" +else + ROCPROF_PRELOAD="${ROCPROF_TOOL_LIBRARY}:${ROCPROF_SDK_LIBRARY}" +fi + +if [ -n "${ROCP_TOOL_LIBRARIES}" ]; then + ROCP_TOOL_LIBRARIES="${ROCP_TOOL_LIBRARIES}:${ROCPROF_TOOL_LIBRARY}" +else + ROCP_TOOL_LIBRARIES="${ROCPROF_TOOL_LIBRARY}" +fi + +LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} + +export ROCP_TOOL_LIBRARIES +export LD_LIBRARY_PATH while true; do if [[ "$1" == "-h" || "$1" == "--help" ]]; then @@ -93,14 +124,26 @@ while true; do shift shift elif [ "$1" == "--hsa-trace" ]; then - export ROCPROF_HSA_API_TRACE=1 + export ROCPROF_HSA_CORE_API_TRACE=1 + export ROCPROF_HSA_AMD_EXT_API_TRACE=1 + export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1 + export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1 + shift + elif [ "$1" == "--hsa-core-trace" ]; then + export ROCPROF_HSA_CORE_API_TRACE=1 + shift + elif [ "$1" == "--hsa-amd-trace" ]; then + export ROCPROF_HSA_AMD_EXT_API_TRACE=1 + shift + elif [ "$1" == "--hsa-image-trace" ]; then + export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1 + shift + elif [ "$1" == "--hsa-finalizer-trace" ]; then + export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1 shift elif [[ "$1" == "-L" || "$1" == "--list-metrics" ]]; then export ROCPROF_LIST_METRICS=1 - ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \ - LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \ - LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \ - exec ${ROCM_DIR}/lib/rocprofiler-sdk/rocprofv3-trigger-list-metrics + LD_PRELOAD="${ROCPROF_PRELOAD}" exec ${ROCM_DIR}/lib/rocprofiler-sdk/rocprofv3-trigger-list-metrics elif [ "$1" == "--kernel-trace" ]; then export ROCPROF_KERNEL_TRACE=1 shift @@ -111,17 +154,25 @@ while true; do export ROCPROF_MARKER_API_TRACE=1 shift elif [ "$1" == "--hip-trace" ]; then - export ROCPROF_HIP_API_TRACE=1 + export ROCPROF_HIP_RUNTIME_API_TRACE=1 + export ROCPROF_HIP_COMPILER_API_TRACE=1 + shift + elif [ "$1" == "--hip-runtime-trace" ]; then + export ROCPROF_HIP_RUNTIME_API_TRACE=1 shift elif [ "$1" == "--hip-compiler-trace" ]; then export ROCPROF_HIP_COMPILER_API_TRACE=1 shift elif [ "$1" == "--sys-trace" ]; then - export ROCPROF_HSA_API_TRACE=1 + export ROCPROF_HSA_CORE_API_TRACE=1 + export ROCPROF_HSA_AMD_EXT_API_TRACE=1 + export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1 + export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1 + export ROCPROF_HIP_RUNTIME_API_TRACE=1 + export ROCPROF_HIP_COMPILER_API_TRACE=1 + export ROCPROF_MARKER_API_TRACE=1 export ROCPROF_KERNEL_TRACE=1 export ROCPROF_MEMORY_COPY_TRACE=1 - export ROCPROF_MARKER_API_TRACE=1 - export ROCPROF_HIP_API_TRACE=1 shift elif [ "$1" == "--" ]; then shift @@ -161,18 +212,12 @@ if [ -n "${PMC_LINES:-}" ]; then export ROCPROF_OUTPUT_PATH=$RESULT_PATH fi ((COUNTER++)) - ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \ - LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \ - LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \ - "${@}" + LD_PRELOAD="${ROCPROF_PRELOAD}" "${@}" if [ -n "$ROCPROF_OUTPUT_PATH" ]; then echo -e "\nThe output path for the following counters: $ROCPROF_OUTPUT_PATH" fi done else # for non counter collection. e.g: tracing - ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \ - LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \ - LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \ - "${@}" + LD_PRELOAD="${ROCPROF_PRELOAD}" exec "${@}" fi diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h index eb20ab0a9b..a524740c5a 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/callback_tracing.h @@ -30,6 +30,8 @@ #include +#include + ROCPROFILER_EXTERN_C_INIT /** @@ -141,9 +143,14 @@ typedef struct uint32_t kernarg_segment_alignment; ///< Alignment (in bytes) of the buffer used to pass ///< arguments to the kernel uint32_t group_segment_size; ///< Size of static group segment memory required by the kernel - ///< (per work-group), in bytes + ///< (per work-group), in bytes. AKA: LDS size uint32_t private_segment_size; ///< Size of static private, spill, and arg segment memory - ///< required by this kernel (per work-item), in bytes. + ///< required by this kernel (per work-item), in bytes. AKA: + ///< scratch size + uint32_t sgpr_count; ///< Scalar general purpose register count + uint32_t arch_vgpr_count; ///< Architecture vector general purpose register count + uint32_t accum_vgpr_count; ///< Accum vector general purpose register count + } rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; /** diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/dispatch_profile.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/dispatch_profile.h index 39aac1b8a4..c85dac8966 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/dispatch_profile.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/dispatch_profile.h @@ -37,6 +37,23 @@ ROCPROFILER_EXTERN_C_INIT * @{ */ +/** + * @brief Kernel dispatch data for profile counting callbacks + * + */ +typedef struct rocprofiler_profile_counting_dispatch_data_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier + rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched + rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued + rocprofiler_correlation_id_t correlation_id; ///< Correlation ID for this dispatch + uint32_t private_segment_size; /// runtime private memory segment size + uint32_t group_segment_size; /// runtime group memory segment size + rocprofiler_dim3_t workgroup_size; /// runtime workgroup size (grid * threads) + rocprofiler_dim3_t grid_size; /// runtime grid size +} rocprofiler_profile_counting_dispatch_data_t; + /** * @brief Kernel Dispatch Callback. This is a callback that is invoked before the kernel * is enqueued into the HSA queue. What counters to collect for a kernel are set @@ -44,22 +61,35 @@ ROCPROFILER_EXTERN_C_INIT * will be collected and emplaced in the buffer with @ref rocprofiler_buffer_id_t used when * setting up this callback. * - * @param [in] queue_id Queue the kernel dispatch packet is being enqueued onto - * @param [in] agent Agent of this queue - * @param [in] correlation_id Correlation ID for this dispatch - * @param [in] dispatch_packet Kernel dispatch packet about to be enqueued into HSA - * @param [in] kernel_id Kernel identifier + * @param [in] dispatch_data @see ::rocprofiler_profile_counting_dispatch_data_t + * @param [out] config Profile config detailing the counters to collect for this kernel + * @param [out] user_data User data unique to this dispatch. Returned in record callback * @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service - * @param [out] config Profile config detailing the counters to collect for this kernel */ typedef void (*rocprofiler_profile_counting_dispatch_callback_t)( - rocprofiler_queue_id_t queue_id, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t correlation_id, - const hsa_kernel_dispatch_packet_t* dispatch_packet, - uint64_t kernel_id, - void* callback_data_args, - rocprofiler_profile_config_id_t* config); + rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* callback_data_args); + +/** + * @brief Counting record callback. This is a callback is invoked when the kernel + * execution is complete and contains the counter profile data requested in + * @ref rocprofiler_profile_counting_dispatch_callback_t. Only used with + * @ref rocprofiler_configure_callback_dispatch_profile_counting_service. + * + * @param [in] dispatch_data @see ::rocprofiler_profile_counting_dispatch_data_t + * @param [in] record_data Counter record data. + * @param [in] record_count Number of counter records. + * @param [in] user_data User data instance from dispatch callback + * @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service + */ +typedef void (*rocprofiler_profile_counting_record_callback_t)( + rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_record_counter_t* record_data, + size_t record_count, + rocprofiler_user_data_t user_data, + void* callback_data_args); /** * @brief Configure buffered dispatch profile Counting Service. @@ -95,29 +125,6 @@ rocprofiler_configure_buffered_dispatch_profile_counting_service( rocprofiler_profile_counting_dispatch_callback_t callback, void* callback_data_args); -/** - * @brief Counting record callback. This is a callback is invoked when the kernel - * execution is complete and contains the counter profile data requested in - * @ref rocprofiler_profile_counting_dispatch_callback_t. Only used with - * @ref rocprofiler_configure_callback_dispatch_profile_counting_service. - * - * @param [in] queue_id Queue the kernel dispatch packet is being enqueued onto - * @param [in] agent Agent of this queue - * @param [in] correlation_id Correlation ID for this dispatch - * @param [in] kernel_id Kernel identifier - * @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service - * @param [in] record_count Number of counter records. - * @param [in] record_data Counter record data. - */ -typedef void (*rocprofiler_profile_counting_record_callback_t)( - rocprofiler_queue_id_t queue_id, - rocprofiler_agent_id_t agent, - rocprofiler_correlation_id_t correlation_id, - uint64_t kernel_id, - void* callback_data_args, - size_t record_count, - rocprofiler_record_counter_t* record_data); - /** * @brief Configure buffered dispatch profile Counting Service. * Collects the counters in dispatch packets and calls a callback diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index 7e5a60d3a0..e2e5fc80bb 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -261,26 +261,16 @@ typedef enum ROCPROFILER_TABLE_LAST = ROCPROFILER_MARKER_NAME_TABLE, } rocprofiler_intercept_table_t; -/** - * @brief Enumeration for specifying the data type contained within the union. - */ -typedef enum -{ - ROCPROFILER_UNION_TYPE_NONE = 0, ///< No union type - ROCPROFILER_UNION_TYPE_STRING, ///< String Type set - ROCPROFILER_UNION_TYPE_INT, ///< Integer Type Set - ROCPROFILER_UNION_TYPE_LAST, -} rocprofiler_union_type_t; - /** * @brief Enumeration for specifying the counter info struct version you want. */ typedef enum { ROCPROFILER_COUNTER_INFO_VERSION_NONE, - ROCPROFILER_COUNTER_INFO_VERSION_0, ///< @see rocprofiler_counter_info_v0_t + ROCPROFILER_COUNTER_INFO_VERSION_0, ///< @see ::rocprofiler_counter_info_v0_t ROCPROFILER_COUNTER_INFO_VERSION_LAST, } rocprofiler_counter_info_version_id_t; + //--------------------------------------------------------------------------------------// // // ALIASES diff --git a/projects/rocprofiler-sdk/source/lib/common/environment.cpp b/projects/rocprofiler-sdk/source/lib/common/environment.cpp index 929b6a2819..9065309392 100644 --- a/projects/rocprofiler-sdk/source/lib/common/environment.cpp +++ b/projects/rocprofiler-sdk/source/lib/common/environment.cpp @@ -112,9 +112,27 @@ get_env(std::string_view env_id, Tp _default, std::enable_if_t +int +set_env(std::string_view env_id, Tp value, int override) +{ + auto str_value = std::stringstream{}; + str_value << value; + return ::setenv(env_id.data(), str_value.str().c_str(), override); +} + #define SPECIALIZE_GET_ENV(TYPE) \ template TYPE get_env( \ - std::string_view, TYPE, std::enable_if_t::value, sfinae>); + std::string_view, TYPE, std::enable_if_t::value, sfinae>); \ + template int set_env(std::string_view, TYPE, int); + +#define SPECIALIZE_SET_ENV(TYPE) template int set_env(std::string_view, TYPE, int); SPECIALIZE_GET_ENV(int8_t) SPECIALIZE_GET_ENV(int16_t) @@ -124,6 +142,11 @@ SPECIALIZE_GET_ENV(uint8_t) SPECIALIZE_GET_ENV(uint16_t) SPECIALIZE_GET_ENV(uint32_t) SPECIALIZE_GET_ENV(uint64_t) + +SPECIALIZE_SET_ENV(const char*) +SPECIALIZE_SET_ENV(std::string) +SPECIALIZE_SET_ENV(float) +SPECIALIZE_SET_ENV(double) } // namespace impl } // namespace common } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/common/environment.hpp b/projects/rocprofiler-sdk/source/lib/common/environment.hpp index d6ad9a4814..12fa7bc925 100644 --- a/projects/rocprofiler-sdk/source/lib/common/environment.hpp +++ b/projects/rocprofiler-sdk/source/lib/common/environment.hpp @@ -48,6 +48,13 @@ get_env(std::string_view, bool); template Tp get_env(std::string_view, Tp, std::enable_if_t::value, sfinae> = {}); + +int +set_env(std::string_view, bool, int override = 0); + +template +int +set_env(std::string_view, Tp, int override = 0); } // namespace impl template @@ -66,6 +73,13 @@ get_env(std::string_view env_id, Tp&& _default) } } +template +inline auto +set_env(std::string_view env_id, Tp&& value, int override = 0) +{ + return impl::set_env(env_id, std::forward(value), override); +} + struct env_config { std::string env_name = {}; diff --git a/projects/rocprofiler-sdk/source/lib/common/logging.cpp b/projects/rocprofiler-sdk/source/lib/common/logging.cpp index 3d4938ef8c..1160372888 100644 --- a/projects/rocprofiler-sdk/source/lib/common/logging.cpp +++ b/projects/rocprofiler-sdk/source/lib/common/logging.cpp @@ -28,17 +28,28 @@ #include #include +#include #include namespace rocprofiler { namespace common { +namespace +{ void -init_logging(std::string_view env_var) +install_failure_signal_handler() { static auto _once = std::once_flag{}; - std::call_once(_once, [env_var]() { + std::call_once(_once, []() { google::InstallFailureSignalHandler(); }); +} +} // namespace + +void +init_logging(std::string_view env_var, logging_config cfg) +{ + static auto _once = std::once_flag{}; + std::call_once(_once, [env_var, &cfg]() { auto get_argv0 = []() { auto ifs = std::ifstream{"/proc/self/cmdline"}; auto sarg = std::string{}; @@ -50,18 +61,16 @@ init_logging(std::string_view env_var) return sarg; }; - static auto argv0 = get_argv0(); - google::InitGoogleLogging(argv0.c_str()); - auto loglvl = common::get_env(env_var, "error"); + auto loglvl = common::get_env(env_var, ""); for(auto& itr : loglvl) itr = tolower(itr); // default to warning - auto loglvl_v = google::WARNING; - if(loglvl.find_first_not_of("0123456789") == std::string::npos) + auto& loglvl_v = cfg.loglevel; + if(!loglvl.empty() && loglvl.find_first_not_of("0123456789") == std::string::npos) { loglvl_v = std::stoul(loglvl); } - else + else if(!loglvl.empty()) { const auto opts = std::unordered_map{{"info", google::INFO}, @@ -77,10 +86,39 @@ init_logging(std::string_view env_var) loglvl_v = opts.at(loglvl); } - FLAGS_minloglevel = loglvl_v; - FLAGS_stderrthreshold = loglvl_v; + update_logging(cfg, true); + + if(!google::IsGoogleLoggingInitialized()) + { + static auto argv0 = get_argv0(); + google::InitGoogleLogging(argv0.c_str()); + } + + update_logging(cfg); + LOG(INFO) << "logging initialized via " << env_var; }); } + +void +update_logging(const logging_config& cfg, bool setup_env, int env_override) +{ + static auto _mtx = std::mutex{}; + auto _lk = std::unique_lock{_mtx}; + + FLAGS_timestamp_in_logfile_name = false; + FLAGS_minloglevel = cfg.loglevel; + FLAGS_stderrthreshold = cfg.loglevel; + FLAGS_logtostderr = cfg.logtostderr; + FLAGS_alsologtostderr = cfg.alsologtostderr; + if(cfg.install_failure_handler) install_failure_signal_handler(); + + if(setup_env) + { + common::set_env("GOOGLE_LOG_DIR", get_env("PWD", ""), env_override); + common::set_env("GOOGLE_LOGTOSTDERR", cfg.loglevel, env_override); + common::set_env("GOOGLE_ALSOLOGTOSTDERR", cfg.alsologtostderr, env_override); + } +} } // namespace common } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/common/logging.hpp b/projects/rocprofiler-sdk/source/lib/common/logging.hpp index 7e0cdce99a..5ea977be52 100644 --- a/projects/rocprofiler-sdk/source/lib/common/logging.hpp +++ b/projects/rocprofiler-sdk/source/lib/common/logging.hpp @@ -22,13 +22,27 @@ #pragma once +#include + +#include #include namespace rocprofiler { namespace common { +struct logging_config +{ + bool install_failure_handler = false; + bool logtostderr = true; + bool alsologtostderr = false; + int32_t loglevel = google::WARNING; +}; + void -init_logging(std::string_view env_var); -} +init_logging(std::string_view env_var, logging_config cfg = logging_config{}); + +void +update_logging(const logging_config& cfg, bool setup_env = false, int env_override = 0); +} // namespace common } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt index 7d1eeec55d..ff5d0e5496 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/CMakeLists.txt @@ -12,7 +12,6 @@ add_subdirectory(plugins) target_link_libraries( rocprofiler-sdk-tool PRIVATE rocprofiler::rocprofiler-shared-library - rocprofiler::rocprofiler-hsa-runtime rocprofiler::rocprofiler-headers rocprofiler::rocprofiler-build-flags rocprofiler::rocprofiler-memcheck diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp index 63d9e1a61d..8149a8a2c6 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp @@ -57,19 +57,22 @@ struct config { config(); - bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true); - bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false); - bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false); - bool hsa_api_trace = get_env("ROCPROF_HSA_API_TRACE", false); - bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); - bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); - bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false); - bool hip_api_trace = get_env("ROCPROF_HIP_API_TRACE", false); - bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false); - bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); - bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); - int mpi_size = get_mpi_size(); - int mpi_rank = get_mpi_rank(); + bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true); + bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false); + bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false); + bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false); + bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false); + bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false); + bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false); + bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); + bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); + bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false); + bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false); + bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false); + bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); + bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); + int mpi_size = get_mpi_size(); + int mpi_rank = get_mpi_rank(); std::string output_path = get_env("ROCPROF_OUTPUT_PATH", fs::current_path().string()); std::string output_file = get_env("ROCPROF_OUTPUT_FILE_NAME", std::to_string(getpid())); std::vector kernel_names = {}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.cpp index c63753e9d1..ba954d77e6 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.cpp @@ -22,7 +22,8 @@ #include "helper.hpp" #include "config.hpp" -#include "rocprofiler-sdk/fwd.h" + +#include #include @@ -33,198 +34,6 @@ #include #include -namespace -{ -using amd_compute_pgm_rsrc_three32_t = uint32_t; - -// AMD Compute Program Resource Register Three. -enum amd_compute_gfx9_pgm_rsrc_three_t -{ - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1) -}; - -enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t -{ - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_SHARED_VGPR_COUNT, 0, 4), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_INST_PREF_SIZE, 4, 6), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_START, 10, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_END, 11, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_IMAGE_OP, 31, 1) -}; - -// Kernel code properties. -enum amd_kernel_code_property_t -{ - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, - 0, - 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, - 3, - 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, 4, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, - 6, - 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, - 10, - 1), // GFX10+ - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4), -}; - -std::unordered_map kernel_descriptor_name_map; - -std::mutex kernel_properties_correlation_mutex; -std::unordered_map - kernel_properties_correlation_map; - -uint32_t -arch_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code) -{ - std::string info_name(name.data(), name.size()); - if(strcmp(name.data(), "gfx90a") == 0 || strncmp(name.data(), "gfx94", 5) == 0) - return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3, - AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) + - 1) * - 4; - - return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, - AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + - 1) * - (AMD_HSA_BITS_GET(kernel_code.kernel_code_properties, - AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) - ? 8 - : 4); -} - -uint32_t -accum_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code) -{ - std::string info_name(name.data(), name.size()); - if(strcmp(info_name.c_str(), "gfx908") == 0) return arch_vgpr_count(name, kernel_code); - if(strcmp(info_name.c_str(), "gfx90a") == 0 || strncmp(info_name.c_str(), "gfx94", 5) == 0) - return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, - AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + - 1) * - 8 - - arch_vgpr_count(name, kernel_code); - - return 0; -} - -uint32_t -sgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code) -{ - // GFX10 and later always allocate 128 sgprs. - - // TODO(srnagara): Recheck the extraction of gfxip from gpu name - - const char* name_data = name.data(); - const size_t gfxip_label_len = std::min(name.size() - 2, size_t{63}); - if(gfxip_label_len > 0 && strnlen(name_data, gfxip_label_len + 1) >= gfxip_label_len) - { - auto gfxip = std::vector{}; - gfxip.resize(gfxip_label_len + 1, '\0'); - memcpy(gfxip.data(), name_data, gfxip_label_len); - // TODO(srnagara): Check if it is hardcoded - if(std::stoi(&gfxip.at(3)) >= 10) return 128; - return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, - AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) / - 2 + - 1) * - 16; - } - return 0; -} - -const auto& -GetLoaderTable() -{ - static const auto _v = []() { - using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t; - auto _tbl = hsa_loader_table_t{}; - memset(&_tbl, 0, sizeof(hsa_loader_table_t)); - hsa_system_get_major_extension_table( - HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_loader_table_t), &_tbl); - return _tbl; - }(); - return _v; -} - -const kernel_descriptor_t* -GetKernelCode(uint64_t kernel_object) -{ - const kernel_descriptor_t* kernel_code = nullptr; - if(GetLoaderTable().hsa_ven_amd_loader_query_host_address == nullptr) return kernel_code; - hsa_status_t status = GetLoaderTable().hsa_ven_amd_loader_query_host_address( - reinterpret_cast(kernel_object), // NOLINT(performance-no-int-to-ptr) - reinterpret_cast(&kernel_code)); - if(HSA_STATUS_SUCCESS != status) - { - kernel_code = reinterpret_cast( // NOLINT(performance-no-int-to-ptr) - kernel_object); - } - return kernel_code; -} -} // namespace - -void -SetKernelProperties(uint64_t correlation_id, rocprofiler_tool_kernel_properties_t kernel_properties) -{ - std::lock_guard kernel_properties_correlation_map_lock( - kernel_properties_correlation_mutex); - kernel_properties_correlation_map[correlation_id] = std::move(kernel_properties); -} - -rocprofiler_tool_kernel_properties_t -GetKernelProperties(uint64_t correlation_id) -{ - std::lock_guard kernel_properties_correlation_map_lock( - kernel_properties_correlation_mutex); - auto it = kernel_properties_correlation_map.find(correlation_id); - if(it == kernel_properties_correlation_map.end()) - { - std::cout << "kernel properties not found" << std::endl; - abort(); - } - return it->second; -} - -void -populate_kernel_properties_data(rocprofiler_tool_kernel_properties_t* kernel_properties, - const hsa_kernel_dispatch_packet_t* dispatch_packet) -{ - const uint64_t kernel_object = dispatch_packet->kernel_object; - - const kernel_descriptor_t* kernel_code = GetKernelCode(kernel_object); - uint64_t grid_size = - dispatch_packet->grid_size_x * dispatch_packet->grid_size_y * dispatch_packet->grid_size_z; - if(grid_size > UINT32_MAX) abort(); - kernel_properties->grid_size = grid_size; - uint64_t workgroup_size = dispatch_packet->workgroup_size_x * - dispatch_packet->workgroup_size_y * dispatch_packet->workgroup_size_z; - if(workgroup_size > UINT32_MAX) abort(); - kernel_properties->workgroup_size = (uint32_t) workgroup_size; - kernel_properties->lds_size = dispatch_packet->group_segment_size; - kernel_properties->scratch_size = dispatch_packet->private_segment_size; - kernel_properties->arch_vgpr_count = - arch_vgpr_count(kernel_properties->gpu_agent.name, *kernel_code); - kernel_properties->accum_vgpr_count = - accum_vgpr_count(kernel_properties->gpu_agent.name, *kernel_code); - kernel_properties->sgpr_count = sgpr_count(kernel_properties->gpu_agent.name, *kernel_code); - kernel_properties->wave_size = - AMD_HSA_BITS_GET(kernel_code->kernel_code_properties, - AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) - ? 32 - : 64; - kernel_properties->signal_handle = dispatch_packet->completion_signal.handle; -} - rocprofiler_tool_buffer_name_info_t get_buffer_id_names() { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp index 3599c0506b..035406302d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp @@ -71,46 +71,6 @@ constexpr size_t BUFFER_SIZE_BYTES = 4096; constexpr size_t WATERMARK = (BUFFER_SIZE_BYTES / 2); -// This can be different for different architecture -// Lets follow the v1 rocprof -// I will have a kernel id from the rocprofiler -// address the kernel descriptor and access the information -// This works for gfx9 but may not for Navi arch -// Interecept the kernel symbol load build a table for kernel id -// when kenel dispatch callback. Here is the kernel id -// Use the kernel id -typedef struct -{ - uint64_t grid_size; - uint64_t workgroup_size; - uint64_t lds_size; - uint64_t scratch_size; - uint64_t arch_vgpr_count; - uint64_t accum_vgpr_count; - uint64_t sgpr_count; - uint64_t wave_size; - uint64_t signal_handle; - uint64_t kernel_object; - rocprofiler_queue_id_t queue_id; - std::string kernel_name; - rocprofiler_agent_t gpu_agent; - uint64_t thread_id; - uint64_t dispatch_index; - -} rocprofiler_tool_kernel_properties_t; - -struct kernel_descriptor_t -{ - uint8_t reserved0[16]; - int64_t kernel_code_entry_byte_offset; - uint8_t reserved1[20]; - uint32_t compute_pgm_rsrc3; - uint32_t compute_pgm_rsrc1; - uint32_t compute_pgm_rsrc2; - uint16_t kernel_code_properties; - uint8_t reserved2[6]; -}; - using rocprofiler_tool_buffer_kind_names_t = std::unordered_map; using rocprofiler_tool_buffer_kind_operation_names_t = @@ -135,29 +95,6 @@ struct rocprofiler_tool_callback_name_info_t rocprofiler_tool_callback_kind_operation_names_t operation_names = {}; }; -// std::vector -// GetCounterNames(); - -void -SetKernelDescriptorName(rocprofiler_address_t kernel_descriptor, const char* name); - -void -SetKernelProperties(uint64_t correlation_id, - rocprofiler_tool_kernel_properties_t kernel_properties); -void -SetKernelProperties(uint64_t correlation_id, - rocprofiler_tool_kernel_properties_t kernel_properties); - -rocprofiler_tool_kernel_properties_t -GetKernelProperties(uint64_t correlation_id); - -const char* -GetKernelDescriptorName(rocprofiler_address_t kernel_descriptor); - -void -populate_kernel_properties_data(rocprofiler_tool_kernel_properties_t* kernel_properties, - const hsa_kernel_dispatch_packet_t* dispatch_packet); - rocprofiler_tool_buffer_name_info_t get_buffer_id_names(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index 43ea6cf7b9..d75af77f49 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -33,6 +33,8 @@ #include "lib/common/utility.hpp" #include +#include +#include #include #include #include @@ -55,6 +57,21 @@ namespace common = ::rocprofiler::common; namespace tool = ::rocprofiler::tool; +namespace std +{ +template <> +struct hash +{ + size_t operator()(rocprofiler_agent_id_t id) const { return id.handle; } +}; +} // namespace std + +inline bool +operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs) +{ + return (lhs.handle == rhs.handle); +} + namespace { constexpr uint32_t lds_block_size = 128 * 4; @@ -68,16 +85,23 @@ get_dereference(Tp* ptr) return *CHECK_NOTNULL(ptr); } -template -void -add_destructor(Tp*& ptr) +auto +get_destructors_lock() { static auto _mutex = std::mutex{}; - auto _lk = std::unique_lock{_mutex}; + return std::unique_lock{_mutex}; +} + +template +Tp*& +add_destructor(Tp*& ptr) +{ + auto _lk = get_destructors_lock(); destructors->emplace_back([&ptr]() { delete ptr; ptr = nullptr; }); + return ptr; } #define ADD_DESTRUCTOR(PTR) \ @@ -155,7 +179,7 @@ get_counter_collection_file() "Process_Id", "Thread_Id", "Grid_Size", - "Kernel-Name", + "Kernel_Name", "Workgroup_Size", "LDS_Block_Size", "Scratch_Size", @@ -255,6 +279,7 @@ get_buffers() return _v; } +using rocprofiler_code_object_data_t = rocprofiler_callback_tracing_code_object_load_data_t; using rocprofiler_kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; @@ -274,14 +299,46 @@ struct kernel_symbol_data : rocprofiler_kernel_symbol_data_t std::string truncated_kernel_name = {}; }; +template +Tp* +as_pointer(Tp&& _val) +{ + return new Tp{std::forward(_val)}; +} + +using code_object_data_map_t = std::unordered_map; using kernel_symbol_data_map_t = std::unordered_map; -auto kernel_data = common::Synchronized{}; +using targeted_kernels_set_t = std::unordered_set; using counter_dimension_info_map_t = std::unordered_map>; -std::atomic dispatch_index{0}; -auto counter_dimension_data = common::Synchronized{}; -auto buffered_name_info = get_buffer_id_names(); -auto callback_name_info = get_callback_id_names(); + +auto code_obj_data = common::Synchronized{}; +auto kernel_data = common::Synchronized{}; +auto counter_dimension_data = common::Synchronized{}; +auto target_kernels = common::Synchronized{}; +auto dispatch_index = std::atomic{0}; +auto* buffered_name_info = as_pointer(get_buffer_id_names()); +auto* callback_name_info = as_pointer(get_callback_id_names()); + +bool +add_kernel_target(uint64_t _kern_id) +{ + return target_kernels + .wlock([](targeted_kernels_set_t& _targets_v, + uint64_t _kern_id_v) { return _targets_v.emplace(_kern_id_v); }, + _kern_id) + .second; +} + +bool +is_targeted_kernel(uint64_t _kern_id) +{ + return target_kernels.rlock( + [](const targeted_kernels_set_t& _targets_v, uint64_t _kern_id_v) { + return (_targets_v.count(_kern_id_v) > 0); + }, + _kern_id); +} auto& get_client_ctx() @@ -328,15 +385,16 @@ cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record, auto ts = rocprofiler_timestamp_t{}; rocprofiler_get_timestamp(&ts); - const auto* kind_name = callback_name_info.kind_names.at(record.kind); + const auto* kind_name = CHECK_NOTNULL(callback_name_info)->kind_names.at(record.kind); if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) { user_data->value = ts; } else { - const auto* op_name = - callback_name_info.operation_names.at(record.kind).at(record.operation); + const auto* op_name = CHECK_NOTNULL(callback_name_info) + ->operation_names.at(record.kind) + .at(record.operation); auto ss = std::stringstream{}; tool::csv::marker_csv_encoder::write_row(ss, kind_name, @@ -368,7 +426,7 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record, auto ts = rocprofiler_timestamp_t{}; rocprofiler_get_timestamp(&ts); - const auto* kind_name = callback_name_info.kind_names.at(record.kind); + const auto* kind_name = CHECK_NOTNULL(callback_name_info)->kind_names.at(record.kind); if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA) { if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) @@ -460,8 +518,9 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record, } else { - const auto* op_name = - callback_name_info.operation_names.at(record.kind).at(record.operation); + const auto* op_name = CHECK_NOTNULL(callback_name_info) + ->operation_names.at(record.kind) + .at(record.operation); auto ss = std::stringstream{}; tool::csv::marker_csv_encoder::write_row(ss, kind_name, @@ -481,78 +540,6 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record, (void) data; } -void -counter_record_callback(rocprofiler_queue_id_t, - const rocprofiler_agent_id_t, - rocprofiler_correlation_id_t correlation_id, - uint64_t, - void*, - size_t record_count, - rocprofiler_record_counter_t* record_data) -{ - rocprofiler_tool_kernel_properties_t kernel_properties = - GetKernelProperties(correlation_id.internal); - std::map counter_name_value; - for(size_t count = 0; count < record_count; count++) - { - auto profiler_record = static_cast(record_data[count]); - rocprofiler_counter_id_t counter_id; - rocprofiler_query_record_counter_id(profiler_record.id, &counter_id); - rocprofiler_counter_info_v0_t version; - ROCPROFILER_CALL( - rocprofiler_query_counter_info( - counter_id, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast(&version)), - "Could not query counter_id"); - const auto& dimension_pos_ss = counter_dimension_data.rlock( - [&profiler_record](const counter_dimension_info_map_t& counter_dimension_data_v, - uint64_t handle) { - auto dimensions = counter_dimension_data_v.at(handle); - size_t pos; - auto pos_ss = std::stringstream{}; - size_t num_dim = dimensions.size(); - for(size_t idx = 0; idx != num_dim; idx++) - { - rocprofiler_query_record_dimension_position( - profiler_record.id, dimensions[idx].id, &pos); - pos_ss << dimensions[idx].name << ":" << pos; - if(idx != num_dim - 1) pos_ss << ","; - } - return pos_ss; - }, - counter_id.handle); - auto search = counter_name_value.find(version.name); - if(search == counter_name_value.end()) - counter_name_value.emplace( - std::pair{version.name, profiler_record.counter_value}); - else - search->second = search->second + profiler_record.counter_value; - } - - for(auto itr = counter_name_value.begin(); itr != counter_name_value.end(); ++itr) - { - auto counter_collection_ss = std::stringstream{}; - tool::csv::counter_collection_csv_encoder::write_row( - counter_collection_ss, - correlation_id.internal, - kernel_properties.dispatch_index, - kernel_properties.gpu_agent.id.handle, - kernel_properties.queue_id.handle, - getpid(), - kernel_properties.thread_id, - kernel_properties.grid_size, - kernel_properties.kernel_name, - kernel_properties.workgroup_size, - ((kernel_properties.lds_size + (lds_block_size - 1)) & ~(lds_block_size - 1)), - kernel_properties.scratch_size, - kernel_properties.arch_vgpr_count, - kernel_properties.sgpr_count, - itr->first, - itr->second); - - get_dereference(get_counter_collection_file()) << counter_collection_ss.str(); - } -} - void code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, @@ -561,7 +548,19 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD) { - if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + auto* obj_data = static_cast(record.payload); + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + code_obj_data.wlock( + [](code_object_data_map_t& cdata, rocprofiler_code_object_data_t* obj_data_v) { + cdata.emplace(obj_data_v->code_object_id, *obj_data_v); + }, + CHECK_NOTNULL(obj_data)); + } + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { flush(); } @@ -573,11 +572,50 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, auto* sym_data = static_cast(record.payload); if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) { - kernel_data.wlock( + auto itr = kernel_data.wlock( [](kernel_symbol_data_map_t& kdata, rocprofiler_kernel_symbol_data_t* sym_data_v) { - kdata.emplace(sym_data_v->kernel_id, kernel_symbol_data{*sym_data_v}); + return kdata.emplace(sym_data_v->kernel_id, kernel_symbol_data{*sym_data_v}); }, - sym_data); + CHECK_NOTNULL(sym_data)); + + LOG_IF(WARNING, !itr.second) + << "duplicate kernel symbol data for kernel_id=" << sym_data->kernel_id; + + // add the kernel to the kernel_targets if + if(itr.second) + { + // if kernel name is provided by user then by default all kernels in the application + // are targeted + if(tool::get_config().kernel_names.empty()) + { + add_kernel_target(sym_data->kernel_id); + } + else + { + const auto& kernel_info = itr.first->second; + for(const auto& name : tool::get_config().kernel_names) + { + if(name == kernel_info.truncated_kernel_name) + { + add_kernel_target(itr.first->first); + break; + } + else + { + auto dkernel_name = std::string_view{kernel_info.demangled_kernel_name}; + auto pos = dkernel_name.find(name); + // if the demangled kernel name contains name and the next character is + // '(' then mark as found + if(pos != std::string::npos && (pos + 1) < dkernel_name.size() && + dkernel_name.at(pos + 1) == '(') + { + add_kernel_target(itr.first->first); + break; + } + } + } + } + } } } @@ -622,7 +660,7 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, auto kernel_trace_ss = std::stringstream{}; tool::csv::kernel_trace_csv_encoder::write_row( kernel_trace_ss, - buffered_name_info.kind_names.at(record->kind), + CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind), record->agent_id.handle, record->queue_id.handle, record->kernel_id, @@ -652,8 +690,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, auto hsa_trace_ss = std::stringstream{}; tool::csv::api_csv_encoder::write_row( hsa_trace_ss, - buffered_name_info.kind_names.at(record->kind), - buffered_name_info.operation_names.at(record->kind).at(record->operation), + CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind), + CHECK_NOTNULL(buffered_name_info) + ->operation_names.at(record->kind) + .at(record->operation), getpid(), record->thread_id, record->correlation_id.internal, @@ -670,8 +710,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, auto memory_copy_trace_ss = std::stringstream{}; tool::csv::memory_copy_csv_encoder::write_row( memory_copy_trace_ss, - buffered_name_info.kind_names.at(record->kind), - buffered_name_info.operation_names.at(record->kind).at(record->operation), + CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind), + CHECK_NOTNULL(buffered_name_info) + ->operation_names.at(record->kind) + .at(record->operation), record->src_agent_id.handle, record->dst_agent_id.handle, record->correlation_id.internal, @@ -689,8 +731,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, auto hip_trace_ss = std::stringstream{}; tool::csv::api_csv_encoder::write_row( hip_trace_ss, - buffered_name_info.kind_names.at(record->kind), - buffered_name_info.operation_names.at(record->kind).at(record->operation), + CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind), + CHECK_NOTNULL(buffered_name_info) + ->operation_names.at(record->kind) + .at(record->operation), getpid(), record->thread_id, record->correlation_id.internal, @@ -710,7 +754,7 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, using counter_vec_t = std::vector; using agent_counter_map_t = - std::unordered_map>; + std::unordered_map>; rocprofiler_status_t dimensions_info_callback(rocprofiler_counter_id_t id, @@ -740,14 +784,14 @@ dimensions_info_callback(rocprofiler_counter_id_t id, // this function creates a rocprofiler profile config on the first entry auto -get_agent_profile(const rocprofiler_agent_t* agent) +get_agent_profile(rocprofiler_agent_id_t agent_id) { static auto data = common::Synchronized{}; auto profile = std::optional{}; data.ulock( - [agent, &profile](const agent_counter_map_t& data_v) { - auto itr = data_v.find(agent); + [agent_id, &profile](const agent_counter_map_t& data_v) { + auto itr = data_v.find(agent_id); if(itr != data_v.end()) { profile = itr->second; @@ -755,11 +799,11 @@ get_agent_profile(const rocprofiler_agent_t* agent) } return false; }, - [agent, &profile](agent_counter_map_t& data_v) { + [agent_id, &profile](agent_counter_map_t& data_v) { auto counters_v = counter_vec_t{}; ROCPROFILER_CALL( rocprofiler_iterate_agent_supported_counters( - agent->id, + agent_id, [](rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, @@ -771,15 +815,15 @@ get_agent_profile(const rocprofiler_agent_t* agent) counters[i], dimensions_info_callback, nullptr), "iterate_dimension_info"); - rocprofiler_counter_info_v0_t version; + rocprofiler_counter_info_v0_t info; ROCPROFILER_CALL( rocprofiler_query_counter_info(counters[i], ROCPROFILER_COUNTER_INFO_VERSION_0, - static_cast(&version)), + static_cast(&info)), "Could not query counter_id"); - if(tool::get_config().counters.count(version.name) > 0) + if(tool::get_config().counters.count(info.name) > 0) vec->emplace_back(counters[i]); } return ROCPROFILER_STATUS_SUCCESS; @@ -791,18 +835,122 @@ get_agent_profile(const rocprofiler_agent_t* agent) { auto profile_v = rocprofiler_profile_config_id_t{}; ROCPROFILER_CALL(rocprofiler_create_profile_config( - agent->id, counters_v.data(), counters_v.size(), &profile_v), + agent_id, counters_v.data(), counters_v.size(), &profile_v), "Could not construct profile cfg"); profile = profile_v; } - data_v.emplace(agent, profile); + data_v.emplace(agent_id, profile); return true; }); return profile; } +struct counter_dispatch_data +{ + uint64_t thread_id = 0; + uint64_t dispatch_index = 0; +}; + +void +dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* /*callback_data_args*/) +{ + auto kernel_id = dispatch_data.kernel_id; + auto agent_id = dispatch_data.agent_id; + + if(!is_targeted_kernel(kernel_id)) + { + return; + } + else if(auto profile = get_agent_profile(agent_id)) + { + *config = *profile; + user_data->ptr = new counter_dispatch_data{.thread_id = common::get_tid(), + .dispatch_index = ++dispatch_index}; + } +} + +void +counter_record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_record_counter_t* record_data, + size_t record_count, + rocprofiler_user_data_t user_data, + void* /*callback_data_args*/) +{ + auto kernel_id = dispatch_data.kernel_id; + const auto* cnt_dispatch_data_v = static_cast(user_data.ptr); + const auto* kernel_info = kernel_data.rlock( + [](const kernel_symbol_data_map_t& kdata, uint64_t kid) -> const auto* { + return &kdata.at(kid); + }, + kernel_id); + + LOG_IF(FATAL, !kernel_info) << "missing kernel information for kernel_id=" << kernel_id; + + LOG_IF(ERROR, record_count == 0) << "zero record count for kernel_id=" << kernel_id + << " (name=" << kernel_info->kernel_name << ")"; + + auto counter_name_value = std::map{}; + for(size_t count = 0; count < record_count; count++) + { + auto profiler_record = static_cast(record_data[count]); + auto counter_id = rocprofiler_counter_id_t{}; + auto info = rocprofiler_counter_info_v0_t{}; + + ROCPROFILER_CALL(rocprofiler_query_record_counter_id(profiler_record.id, &counter_id), + "query record counter id"); + ROCPROFILER_CALL( + rocprofiler_query_counter_info( + counter_id, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast(&info)), + "query counter info"); + + auto search = counter_name_value.find(info.name); + if(search == counter_name_value.end()) + counter_name_value.emplace( + std::pair{info.name, profiler_record.counter_value}); + else + search->second = search->second + profiler_record.counter_value; + } + + auto lds_block_size_v = + (kernel_info->group_segment_size + (lds_block_size - 1)) & ~(lds_block_size - 1); + + const auto& correlation_id = dispatch_data.correlation_id; + + auto magnitude = [](rocprofiler_dim3_t dims) { return (dims.x * dims.y * dims.z); }; + + for(auto& itr : counter_name_value) + { + using csv_encoder = tool::csv::counter_collection_csv_encoder; + + auto counter_collection_ss = std::stringstream{}; + csv_encoder::write_row(counter_collection_ss, + correlation_id.internal, + cnt_dispatch_data_v->dispatch_index, + dispatch_data.agent_id.handle, + dispatch_data.queue_id.handle, + getpid(), + cnt_dispatch_data_v->thread_id, + magnitude(dispatch_data.grid_size), + kernel_info->formatted_kernel_name, + magnitude(dispatch_data.workgroup_size), + lds_block_size_v, + kernel_info->private_segment_size, + kernel_info->arch_vgpr_count, + kernel_info->sgpr_count, + itr.first, + itr.second); + + get_dereference(get_counter_collection_file()) << counter_collection_ss.str(); + } + + delete cnt_dispatch_data_v; +} + rocprofiler_status_t list_metrics_iterate_agents(rocprofiler_agent_version_t, const void** agents, @@ -912,61 +1060,6 @@ list_metrics_iterate_agents(rocprofiler_agent_version_t, return ROCPROFILER_STATUS_SUCCESS; } -void -dispatch_callback(rocprofiler_queue_id_t queue_id, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t correlation_id, - const hsa_kernel_dispatch_packet_t* dispatch_packet, - uint64_t kernel_id, - void* /*callback_data_args*/, - rocprofiler_profile_config_id_t* config) -{ - rocprofiler_tool_kernel_properties_t kernel_properties; - const auto& kernel_info = - kernel_data.rlock([](const kernel_symbol_data_map_t& kdata, - uint64_t kernel_id_v) { return kdata.at(kernel_id_v); }, - kernel_id); - auto is_targeted_kernel = [&kernel_info]() { - // if kernel name is provided by user then by default all kernels in the application are - // targeted - if(tool::get_config().kernel_names.empty()) return true; - - for(const auto& name : tool::get_config().kernel_names) - { - if(name == kernel_info.truncated_kernel_name) - return true; - else - { - auto dkernel_name = std::string_view{kernel_info.demangled_kernel_name}; - auto pos = dkernel_name.find(name); - // if the demangled kernel name contains name and the next character is '(' then - // mark as found - if(pos != std::string::npos && (pos + 1) < dkernel_name.size() && - dkernel_name.at(pos + 1) == '(') - return true; - } - } - return false; - }; - - if(!is_targeted_kernel()) return; - - auto profile = get_agent_profile(agent); - - if(profile) - { - kernel_properties.kernel_name = kernel_info.formatted_kernel_name; - kernel_properties.dispatch_index = ++dispatch_index; - kernel_properties.queue_id = queue_id; - kernel_properties.gpu_agent = *agent; - kernel_properties.thread_id = common::get_tid(); - populate_kernel_properties_data(&kernel_properties, dispatch_packet); - SetKernelProperties(correlation_id.internal, kernel_properties); - - *config = *profile; - } -} - rocprofiler_client_finalize_t client_finalizer = nullptr; rocprofiler_client_id_t* client_identifier = nullptr; @@ -1059,7 +1152,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "buffer tracing service for memory copy configure"); } - if(tool::get_config().hsa_api_trace) + if(tool::get_config().hsa_core_api_trace || tool::get_config().hsa_amd_ext_api_trace || + tool::get_config().hsa_image_ext_api_trace || tool::get_config().hsa_finalizer_ext_api_trace) { ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), buffer_size, @@ -1070,18 +1164,27 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &get_buffers().hsa_api_trace), "buffer creation"); - for(auto itr : {ROCPROFILER_BUFFER_TRACING_HSA_CORE_API, - ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API, - ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API, - ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API}) + using optpair_t = std::pair; + for(auto itr : {optpair_t{tool::get_config().hsa_core_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_CORE_API}, + optpair_t{tool::get_config().hsa_core_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API}, + optpair_t{tool::get_config().hsa_core_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API}, + optpair_t{tool::get_config().hsa_core_api_trace, + ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API}}) { - ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( - get_client_ctx(), itr, nullptr, 0, get_buffers().hsa_api_trace), - "buffer tracing service for hsa api configure"); + if(itr.first) + { + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service( + get_client_ctx(), itr.second, nullptr, 0, get_buffers().hsa_api_trace), + "buffer tracing service for hsa api configure"); + } } } - if(tool::get_config().hip_api_trace || tool::get_config().hip_compiler_api_trace) + if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace) { ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), buffer_size, @@ -1092,7 +1195,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &get_buffers().hip_api_trace), "buffer creation"); - if(tool::get_config().hip_api_trace) + if(tool::get_config().hip_runtime_api_trace) { ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( get_client_ctx(), @@ -1187,7 +1290,8 @@ rocprofiler_configure(uint32_t version, uint32_t priority, rocprofiler_client_id_t* id) { - common::init_logging("ROCPROF_LOG_LEVEL"); + auto logging_cfg = rocprofiler::common::logging_config{.install_failure_handler = true}; + common::init_logging("ROCPROF_LOG_LEVEL", logging_cfg); FLAGS_colorlogtostderr = true; // set the client name @@ -1205,6 +1309,10 @@ rocprofiler_configure(uint32_t version, uint32_t minor = (version % 10000) / 100; uint32_t patch = version % 100; + // ensure these pointers are not leaked + add_destructor(buffered_name_info); + add_destructor(callback_name_info); + if(tool::get_config().list_metrics) { ROCPROFILER_CALL(rocprofiler_at_intercept_table_registration( diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt index a6f6003dac..ca1f39142b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt @@ -67,8 +67,9 @@ set_target_properties(rocprofiler-object-library PROPERTIES POSITION_INDEPENDENT add_library(rocprofiler-shared-library SHARED) add_library(rocprofiler::rocprofiler-shared-library ALIAS rocprofiler-shared-library) -target_sources(rocprofiler-shared-library - PRIVATE $) +target_sources( + rocprofiler-shared-library + PRIVATE $ shared_library.cpp) target_link_libraries( rocprofiler-shared-library INTERFACE rocprofiler::rocprofiler-headers diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp index 5ce4468991..f0f409c739 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/agent.cpp @@ -48,6 +48,14 @@ namespace fs = rocprofiler::common::filesystem; +#if defined(ROCPROFILER_CI) +# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(FATAL, __VA_ARGS__) +# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(FATAL) +#else +# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(NON_CI_LEVEL, __VA_ARGS__) +# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL) +#endif + namespace rocprofiler { namespace agent @@ -697,7 +705,7 @@ construct_agent_cache(::HsaApiTable* table) }, &hsa_agents); - LOG_IF(FATAL, rocp_agents.size() != hsa_agents.size()) + ROCP_CI_LOG_IF(ERROR, rocp_agents.size() != hsa_agents.size()) << "Found " << rocp_agents.size() << " rocprofiler agents and " << hsa_agents.size() << " HSA agents"; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.cpp index e3649ab8d9..c7a7f0b2c5 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.cpp @@ -24,6 +24,7 @@ #include "lib/common/container/small_vector.hpp" #include "lib/common/synchronized.hpp" +#include "lib/common/utility.hpp" #include "lib/rocprofiler-sdk/agent.hpp" #include "lib/rocprofiler-sdk/aql/helpers.hpp" #include "lib/rocprofiler-sdk/aql/packet_construct.hpp" @@ -32,6 +33,7 @@ #include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" #include "lib/rocprofiler-sdk/registration.hpp" +#include #include namespace rocprofiler @@ -255,6 +257,7 @@ queue_cb(const std::shared_ptr& info, const hsa::Queue& queue, const hsa::rocprofiler_packet& pkt, uint64_t kernel_id, + rocprofiler_user_data_t* user_data, const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, const context::correlation_id* correlation_id) { @@ -272,14 +275,25 @@ queue_cb(const std::shared_ptr& info, } } - rocprofiler_profile_config_id_t req_profile = {.handle = 0}; - info->user_cb(queue.get_id(), - queue.get_agent().get_rocp_agent(), - _corr_id_v, - &pkt.kernel_dispatch, - kernel_id, - info->callback_args, - &req_profile); + auto req_profile = rocprofiler_profile_config_id_t{.handle = 0}; + auto dispatch_data = + common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{}); + + dispatch_data.kernel_id = kernel_id; + dispatch_data.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id; + dispatch_data.queue_id = queue.get_id(); + dispatch_data.correlation_id = _corr_id_v; + dispatch_data.private_segment_size = pkt.kernel_dispatch.private_segment_size; + dispatch_data.group_segment_size = pkt.kernel_dispatch.group_segment_size; + dispatch_data.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x, + pkt.kernel_dispatch.workgroup_size_y, + pkt.kernel_dispatch.workgroup_size_z}; + dispatch_data.grid_size = {pkt.kernel_dispatch.grid_size_x, + pkt.kernel_dispatch.grid_size_y, + pkt.kernel_dispatch.grid_size_z}; + + info->user_cb(dispatch_data, &req_profile, user_data, info->callback_args); + if(req_profile.handle == 0) return nullptr; auto prof_config = get_controller().get_profile_cfg(req_profile); @@ -407,13 +421,27 @@ completed_cb(const std::shared_ptr& info, if(!out.empty()) { CHECK(info->record_callback); - info->record_callback(queue.get_id(), - queue.get_agent().get_rocp_agent()->id, - _corr_id_v, - session.kernel_id, - info->record_callback_args, - out.size(), - out.data()); + + auto dispatch_data = + common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{}); + + const auto& kernel_dispatch_pkt = session.kernel_pkt.kernel_dispatch; + + dispatch_data.kernel_id = session.kernel_id; + dispatch_data.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id; + dispatch_data.queue_id = queue.get_id(); + dispatch_data.correlation_id = _corr_id_v; + dispatch_data.private_segment_size = kernel_dispatch_pkt.private_segment_size; + dispatch_data.group_segment_size = kernel_dispatch_pkt.group_segment_size; + dispatch_data.workgroup_size = {kernel_dispatch_pkt.workgroup_size_x, + kernel_dispatch_pkt.workgroup_size_y, + kernel_dispatch_pkt.workgroup_size_z}; + dispatch_data.grid_size = {kernel_dispatch_pkt.grid_size_x, + kernel_dispatch_pkt.grid_size_y, + kernel_dispatch_pkt.grid_size_z}; + + info->record_callback( + dispatch_data, out.data(), out.size(), session.user_data, info->record_callback_args); } } @@ -436,9 +464,11 @@ start_context(const context::context* ctx) [=](const hsa::Queue& q, const hsa::rocprofiler_packet& kern_pkt, uint64_t kernel_id, + rocprofiler_user_data_t* user_data, const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, const context::correlation_id* correlation_id) { - return queue_cb(cb, q, kern_pkt, kernel_id, extern_corr_ids, correlation_id); + return queue_cb( + cb, q, kern_pkt, kernel_id, user_data, extern_corr_ids, correlation_id); }, // Completion CB [=](const hsa::Queue& q, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.hpp index 9bfcca954f..8fe76c47b3 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/core.hpp @@ -24,6 +24,7 @@ #include #include +#include #include "lib/rocprofiler-sdk/aql/helpers.hpp" #include "lib/rocprofiler-sdk/aql/packet_construct.hpp" @@ -80,7 +81,6 @@ struct counter_callback_info // HSA Queue ClientID. This is an ID we get when we insert a callback into the // HSA queue interceptor. This ID can be used to disable the callback. rocprofiler::hsa::ClientID queue_id{-1}; - // Buffer to use for storing counter data. Used if callback is not set. std::optional buffer; @@ -130,6 +130,7 @@ queue_cb(const std::shared_ptr& info, const hsa::Queue& queue, const hsa::rocprofiler_packet& pkt, uint64_t kernel_id, + rocprofiler_user_data_t* user_data, const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, const context::correlation_id* correlation_id); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp index 98195a2f1e..10cf96b03f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/core.cpp @@ -20,10 +20,23 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include -#include -#include +#include "lib/rocprofiler-sdk/counters/core.hpp" +#include "lib/common/static_object.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/agent.hpp" +#include "lib/rocprofiler-sdk/buffer.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/counters/id_decode.hpp" +#include "lib/rocprofiler-sdk/counters/metrics.hpp" +#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" +#include "lib/rocprofiler-sdk/hsa/queue.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" + +#include +#include +#include +#include #include #include @@ -31,21 +44,10 @@ #include #include -#include - -#include "lib/common/static_object.hpp" -#include "lib/common/utility.hpp" -#include "lib/rocprofiler-sdk/agent.hpp" -#include "lib/rocprofiler-sdk/buffer.hpp" -#include "lib/rocprofiler-sdk/context/context.hpp" -#include "lib/rocprofiler-sdk/counters/core.hpp" -#include "lib/rocprofiler-sdk/counters/id_decode.hpp" -#include "lib/rocprofiler-sdk/counters/metrics.hpp" -#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" -#include "lib/rocprofiler-sdk/hsa/queue.hpp" -#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" -#include "lib/rocprofiler-sdk/registration.hpp" -#include "rocprofiler-sdk/registration.h" +#include +#include +#include +#include using namespace rocprofiler::counters; using namespace rocprofiler; @@ -204,13 +206,10 @@ buffered_callback(rocprofiler_context_id_t, } void -null_dispatch_callback(rocprofiler_queue_id_t, - const rocprofiler_agent_t*, - rocprofiler_correlation_id_t, - const hsa_kernel_dispatch_packet_t*, - uint64_t, - void*, - rocprofiler_profile_config_id_t*) +null_dispatch_callback(rocprofiler_profile_counting_dispatch_data_t, + rocprofiler_profile_config_id_t*, + rocprofiler_user_data_t*, + void*) {} void @@ -223,13 +222,11 @@ null_buffered_callback(rocprofiler_context_id_t, {} void -null_record_callback(rocprofiler_queue_id_t, - rocprofiler_agent_id_t, - rocprofiler_correlation_id_t, - uint64_t, - void*, +null_record_callback(rocprofiler_profile_counting_dispatch_data_t, + rocprofiler_record_counter_t*, size_t, - rocprofiler_record_counter_t*) + rocprofiler_user_data_t, + void*) {} } // namespace @@ -326,10 +323,10 @@ public: , _agent(a) , _id(id) {} - virtual const AgentCache& get_agent() const override final { return _agent; }; - virtual rocprofiler_queue_id_t get_id() const override final { return _id; }; + const AgentCache& get_agent() const final { return _agent; }; + rocprofiler_queue_id_t get_id() const final { return _id; }; - ~FakeQueue() {} + ~FakeQueue() override = default; private: const AgentCache& _agent; @@ -339,39 +336,60 @@ private: } // namespace hsa } // namespace rocprofiler +bool +operator==(rocprofiler_dim3_t lhs, rocprofiler_dim3_t rhs) +{ + return std::tie(lhs.x, lhs.y, lhs.z) == std::tie(rhs.x, rhs.y, rhs.z); +} + +bool +operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs) +{ + return (lhs.handle == rhs.handle); +} + namespace { struct expected_dispatch { // To pass back - rocprofiler_profile_config_id_t id; - - rocprofiler_queue_id_t queue_id; - const rocprofiler_agent_t* agent; - rocprofiler_correlation_id_t correlation_id; - hsa_kernel_dispatch_packet_t* dispatch_packet; - uint64_t kernel_id; - rocprofiler_profile_config_id_t* config; + rocprofiler_profile_config_id_t id = {}; + rocprofiler_queue_id_t queue_id = {.handle = 0}; + rocprofiler_agent_id_t agent_id = {.handle = 0}; + uint64_t kernel_id = 0; + rocprofiler_correlation_id_t correlation_id = {.internal = 0, .external = {.value = 0}}; + rocprofiler_dim3_t workgroup_size = {0, 0, 0}; + rocprofiler_dim3_t grid_size = {0, 0, 0}; + rocprofiler_profile_config_id_t* config = nullptr; }; void -user_dispatch_cb(rocprofiler_queue_id_t queue_id, - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t correlation_id, - const hsa_kernel_dispatch_packet_t* dispatch_packet, - uint64_t kernel_id, - void* callback_data_args, - rocprofiler_profile_config_id_t* config) +user_dispatch_cb(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* callback_data_args) { expected_dispatch& expected = *static_cast(callback_data_args); - ASSERT_EQ(expected.agent, agent); - ASSERT_EQ(expected.queue_id.handle, queue_id.handle); - ASSERT_EQ(expected.correlation_id.internal, correlation_id.internal); - ASSERT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr); - ASSERT_EQ(expected.correlation_id.external.value, correlation_id.external.value); - ASSERT_EQ(expected.dispatch_packet, dispatch_packet); - ASSERT_EQ(expected.kernel_id, kernel_id); + + auto agent_id = dispatch_data.agent_id; + auto queue_id = dispatch_data.queue_id; + auto correlation_id = dispatch_data.correlation_id; + auto kernel_id = dispatch_data.kernel_id; + + EXPECT_EQ(sizeof(rocprofiler_profile_counting_dispatch_data_t), dispatch_data.size); + EXPECT_EQ(expected.kernel_id, kernel_id); + EXPECT_EQ(expected.agent_id, agent_id); + EXPECT_EQ(expected.queue_id.handle, queue_id.handle); + EXPECT_EQ(expected.correlation_id.internal, correlation_id.internal); + EXPECT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr); + EXPECT_EQ(expected.correlation_id.external.value, correlation_id.external.value); + EXPECT_EQ(expected.workgroup_size, dispatch_data.workgroup_size); + EXPECT_EQ(expected.grid_size, dispatch_data.grid_size); + + ASSERT_NE(config, nullptr); config->handle = expected.id.handle; + + (void) user_data; } } // namespace @@ -440,17 +458,22 @@ TEST(core, check_callbacks) hsa::rocprofiler_packet pkt; pkt.ext_amd_aql_pm4.header = count++; - expected.correlation_id = {.internal = corr_id.internal, + expected.correlation_id = {.internal = corr_id.internal, .external = context::null_user_data}; - expected.dispatch_packet = &pkt.kernel_dispatch; - expected.kernel_id = count++; - expected.queue_id = qid; - expected.agent = fq.get_agent().get_rocp_agent(); + expected.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x, + pkt.kernel_dispatch.workgroup_size_y, + pkt.kernel_dispatch.workgroup_size_z}; + expected.grid_size = {pkt.kernel_dispatch.grid_size_x, + pkt.kernel_dispatch.grid_size_y, + pkt.kernel_dispatch.grid_size_z}; + expected.kernel_id = count++; + expected.queue_id = qid; + expected.agent_id = fq.get_agent().get_rocp_agent()->id; hsa::Queue::queue_info_session_t::external_corr_id_map_t extern_ids = {}; - - auto ret_pkt = - counters::queue_cb(cb_info, fq, pkt, expected.kernel_id, extern_ids, &corr_id); + auto user_data = rocprofiler_user_data_t{.value = corr_id.internal}; + auto ret_pkt = counters::queue_cb( + cb_info, fq, pkt, expected.kernel_id, &user_data, extern_ids, &corr_id); ASSERT_TRUE(ret_pkt) << fmt::format("Expected a packet to be generated for - {}", metric.name()); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp index a8290c8b81..b50fed0600 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp @@ -20,14 +20,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include - -#include -#include -#include -#include -#include - #include "lib/common/static_object.hpp" #include "lib/common/utility.hpp" #include "lib/rocprofiler-sdk/agent.hpp" @@ -42,7 +34,15 @@ #include "lib/rocprofiler-sdk/hsa/queue.hpp" #include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" #include "lib/rocprofiler-sdk/registration.hpp" -#include "rocprofiler-sdk/registration.h" + +#include +#include + +#include +#include +#include +#include +#include namespace { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/init_order.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/init_order.cpp index 7e618616da..8539e024a1 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/init_order.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/init_order.cpp @@ -20,16 +20,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include -#include -#include -#include - -#include -#include - -#include - #include "lib/common/static_object.hpp" #include "lib/common/utility.hpp" #include "lib/rocprofiler-sdk/buffer.hpp" @@ -37,7 +27,18 @@ #include "lib/rocprofiler-sdk/counters/id_decode.hpp" #include "lib/rocprofiler-sdk/counters/metrics.hpp" #include "lib/rocprofiler-sdk/registration.hpp" -#include "rocprofiler-sdk/registration.h" + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include using namespace rocprofiler::counters; @@ -125,13 +126,10 @@ buffered_callback(rocprofiler_context_id_t, {} void -dispatch_callback(rocprofiler_queue_id_t, - const rocprofiler_agent_t*, - rocprofiler_correlation_id_t, - const hsa_kernel_dispatch_packet_t*, - uint64_t, - void*, - rocprofiler_profile_config_id_t*) +dispatch_callback(rocprofiler_profile_counting_dispatch_data_t, + rocprofiler_profile_config_id_t*, + rocprofiler_user_data_t*, + void*) {} rocprofiler_context_id_t& diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp index 1cfec5e377..026e501826 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp @@ -166,12 +166,131 @@ get_names() namespace { -using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t; -using context_t = context::context; -using user_data_t = rocprofiler_user_data_t; -using context_array_t = context::context_array_t; -using context_user_data_map_t = std::unordered_map; -using name_array_t = std::vector>>; +using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t; +using context_t = context::context; +using user_data_t = rocprofiler_user_data_t; +using context_array_t = context::context_array_t; +using context_user_data_map_t = std::unordered_map; +using name_array_t = std::vector>>; +using amd_compute_pgm_rsrc_three32_t = uint32_t; + +struct kernel_descriptor_t +{ + uint8_t reserved0[16]; + int64_t kernel_code_entry_byte_offset; + uint8_t reserved1[20]; + uint32_t compute_pgm_rsrc3; + uint32_t compute_pgm_rsrc1; + uint32_t compute_pgm_rsrc2; + uint16_t kernel_code_properties; + uint8_t reserved2[6]; +}; + +// AMD Compute Program Resource Register Three. +enum amd_compute_gfx9_pgm_rsrc_three_t +{ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1) +}; + +enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t +{ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_SHARED_VGPR_COUNT, 0, 4), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_INST_PREF_SIZE, 4, 6), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_START, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_END, 11, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_IMAGE_OP, 31, 1) +}; + +// Kernel code properties. +enum amd_kernel_code_property_t +{ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, + 0, + 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, + 3, + 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, + 6, + 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, + 10, + 1), // GFX10+ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4), +}; + +uint32_t +arch_vgpr_count(std::string_view name, kernel_descriptor_t kernel_code) +{ + if(name == "gfx90a" || name.find("gfx94") == 0) + return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3, + AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) + + 1) * + 4; + + return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + + 1) * + (AMD_HSA_BITS_GET(kernel_code.kernel_code_properties, + AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) + ? 8 + : 4); +} + +uint32_t +accum_vgpr_count(std::string_view name, kernel_descriptor_t kernel_code) +{ + if(name == "gfx908") + return arch_vgpr_count(name, kernel_code); + else if(name == "gfx90a" || name.find("gfx94") == 0) + return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + + 1) * + (8 - arch_vgpr_count(name, kernel_code)); + + LOG(WARNING) << "Missing support for accum_vgpr_count for " << name; + return 0; +} + +uint32_t +sgpr_count(std::string_view name, kernel_descriptor_t kernel_code) +{ + // GFX10 and later always allocate 128 sgprs. + constexpr uint32_t gfx10_sgprs = 128; + + auto begp = name.find_first_of("0123456789"); + if(!name.empty() && begp != std::string_view::npos) + { + auto endp = name.find_first_not_of("0123456789", begp); + auto lenp = (endp - begp) + 1; + auto gfxip_str = name.substr(begp, lenp); + auto gfxip_n = int32_t{0}; + if(!gfxip_str.empty()) gfxip_n = std::stoi(std::string{gfxip_str}); + + if(gfxip_n >= 1000) + { + return gfx10_sgprs; + } + else + { + return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) / + 2 + + 1) * + 16; + } + } + + LOG(WARNING) << "Missing support for sgpr_count for " << name; + return 0; +} name_array_t* get_string_array() @@ -212,6 +331,41 @@ get_loader_table() return _v; } +auto*& +get_status_string_function() +{ + static decltype(::hsa_status_string)* _v = nullptr; + return _v; +} + +std::string_view +get_status_string(hsa_status_t _status) +{ + const char* _msg = nullptr; + if(get_status_string_function() && + get_status_string_function()(_status, &_msg) == HSA_STATUS_SUCCESS && _msg) + return std::string_view{_msg}; + + return std::string_view{"(unknown HSA error)"}; +} + +const kernel_descriptor_t* +get_kernel_descriptor(uint64_t kernel_object) +{ + const kernel_descriptor_t* kernel_code = nullptr; + if(get_loader_table().hsa_ven_amd_loader_query_host_address == nullptr) return kernel_code; + hsa_status_t status = get_loader_table().hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object), // NOLINT(performance-no-int-to-ptr) + reinterpret_cast(&kernel_code)); + if(status == HSA_STATUS_SUCCESS) return kernel_code; + + LOG(WARNING) << "hsa_ven_amd_loader_query_host_address(kernel_object=" << kernel_object + << ") returned " << status << ": " << get_status_string(status); + + // NOLINTNEXTLINE(performance-no-int-to-ptr) + return reinterpret_cast(kernel_object); +} + struct kernel_symbol { using kernel_symbol_data_t = @@ -441,6 +595,19 @@ executable_iterate_agent_symbols_load_callback(hsa_executable_t executabl ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &data.private_segment_size); + // This works for gfx9 but may not for Navi arch + const auto* kernel_descript = get_kernel_descriptor(data.kernel_object); + if(CHECK_NOTNULL(code_obj_v) && CHECK_NOTNULL(kernel_descript)) + { + const auto* rocp_agent = agent::get_agent(code_obj_v->rocp_data.rocp_agent); + if(CHECK_NOTNULL(rocp_agent)) + { + data.arch_vgpr_count = arch_vgpr_count(rocp_agent->name, *kernel_descript); + data.accum_vgpr_count = accum_vgpr_count(rocp_agent->name, *kernel_descript); + data.sgpr_count = sgpr_count(rocp_agent->name, *kernel_descript); + } + } + // if we have reached this point (i.e. there were no HSA errors returned within macro) then we // generate a unique kernel symbol id data.kernel_id = ++get_kernel_symbol_id(); @@ -905,10 +1072,13 @@ code_object_init(HsaApiTable* table) { auto& core_table = *table->core_; + get_status_string_function() = core_table.hsa_status_string_fn; + auto _status = core_table.hsa_system_get_major_extension_table_fn( HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_loader_table_t), &get_loader_table()); - LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS) << "hsa_system_get_major_extension_table failed"; + LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS) + << "hsa_system_get_major_extension_table failed: " << get_status_string(_status); if(_status == HSA_STATUS_SUCCESS) { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp index 68628b8b06..468c359fc6 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp @@ -253,8 +253,9 @@ WriteInterceptor(const void* packets, return; } - auto thr_id = common::get_tid(); - auto* corr_id = context::get_latest_correlation_id(); + auto thr_id = common::get_tid(); + auto* corr_id = context::get_latest_correlation_id(); + auto user_data = rocprofiler_user_data_t{.value = 0}; // use thread-local value to reuse allocation auto extern_corr_ids = Queue::queue_info_session_t::external_corr_id_map_t{}; @@ -300,8 +301,8 @@ WriteInterceptor(const void* packets, queue.signal_callback([&](const auto& map) { for(const auto& [client_id, cb_pair] : map) { - if(auto maybe_pkt = - cb_pair.first(queue, kernel_pkt, kernel_id, extern_corr_ids, corr_id)) + if(auto maybe_pkt = cb_pair.first( + queue, kernel_pkt, kernel_id, &user_data, extern_corr_ids, corr_id)) { inst_pkt.push_back(std::make_pair(std::move(maybe_pkt), client_id)); } @@ -381,6 +382,7 @@ WriteInterceptor(const void* packets, .tid = thr_id, .kernel_id = kernel_id, .queue_id = queue.get_id(), + .user_data = user_data, .hsa_agent = queue.get_agent().get_hsa_agent(), .rocp_agent = queue.get_agent().get_rocp_agent(), .correlation_id = corr_id, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp index 95f9c05c93..c2bbee1f7b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp @@ -122,6 +122,7 @@ public: rocprofiler_thread_id_t tid = common::get_tid(); rocprofiler_kernel_id_t kernel_id = 0; rocprofiler_queue_id_t queue_id = {}; + rocprofiler_user_data_t user_data = {.value = 0}; hsa_agent_t hsa_agent = {}; const rocprofiler_agent_t* rocp_agent = nullptr; context::correlation_id* correlation_id = nullptr; @@ -137,6 +138,7 @@ public: const Queue&, const rocprofiler_packet&, uint64_t, + rocprofiler_user_data_t*, const queue_info_session_t::external_corr_id_map_t&, const context::correlation_id*)>; // Signals the completion of the kernel packet. @@ -220,4 +222,4 @@ Queue::signal_callback(FuncT&& func) const } } // namespace hsa -} // namespace rocprofiler \ No newline at end of file +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/profile_config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/profile_config.cpp index 6dd69228ca..4a37337d05 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/profile_config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/profile_config.cpp @@ -20,7 +20,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include +#include #include #include "lib/common/synchronized.hpp" @@ -31,7 +31,8 @@ #include "lib/rocprofiler-sdk/counters/evaluate_ast.hpp" #include "lib/rocprofiler-sdk/counters/metrics.hpp" #include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" -#include "rocprofiler-sdk/fwd.h" + +#include extern "C" { /** diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp index aa1c6d7958..b53f2ab795 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/registration.cpp @@ -235,14 +235,15 @@ find_clients() { for(const auto& itr : env) { - LOG(INFO) << "searching " << itr << " for rocprofiler_configure"; + LOG(INFO) << "[env] searching " << itr << " for rocprofiler_configure"; void* handle = dlopen(itr.c_str(), RTLD_NOLOAD | RTLD_LAZY); if(!handle) { - LOG(INFO) << itr << " is not already loaded, doing a global lazy dlopen..."; - handle = dlopen(itr.c_str(), RTLD_GLOBAL | RTLD_LAZY); + LOG(WARNING) << "[env] " << itr + << " is not already loaded, doing a local lazy dlopen..."; + handle = dlopen(itr.c_str(), RTLD_LOCAL | RTLD_LAZY); } if(!handle) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/shared_library.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/shared_library.cpp new file mode 100644 index 0000000000..9d85b42958 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/shared_library.cpp @@ -0,0 +1,84 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/common/environment.hpp" +#include "lib/common/logging.hpp" +#include "lib/common/static_object.hpp" +#include "lib/rocprofiler-sdk/allocator.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" + +#include + +namespace rocprofiler +{ +namespace shared_library +{ +namespace +{ +struct lifetime +{ + lifetime(); + ~lifetime(); +}; + +lifetime::lifetime() +{ + registration::init_logging(); + + if(common::get_env("ROCPROFILER_LIBRARY_CTOR", false)) + { + LOG(INFO) << "Initializing rocprofiler-sdk library..."; + registration::initialize(); + LOG(INFO) << "rocprofiler-sdk library initialized"; + } +} + +lifetime::~lifetime() +{ + if(common::get_env("ROCPROFILER_LIBRARY_DTOR", false)) + { + LOG(INFO) << "Finalizing rocprofiler-sdk library..."; + registration::finalize(); + LOG(INFO) << "rocprofiler-sdk library finalized"; + } +} + +auto*& +get_lifetime() +{ + static auto* _v = common::static_object::construct(); + return _v; +} +} // namespace +} // namespace shared_library + +auto rocprofiler_sdk_shlib_lifetime = shared_library::get_lifetime(); + +void +rocprofiler_sdk_shlib_ctor() ROCPROFILER_ATTRIBUTE(constructor(101)); + +void +rocprofiler_sdk_shlib_ctor() +{ + (void) shared_library::get_lifetime(); +} +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/common.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/common.hpp index 328eb6b52a..a14e9e9e9d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/common.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/common.hpp @@ -22,11 +22,11 @@ #pragma once +#include #include #include #include "lib/common/defines.hpp" -#include "rocprofiler-sdk/fwd.h" #include diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/roctx.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/roctx.cpp index 3590925f8e..a84b510d3e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/roctx.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/tests/roctx.cpp @@ -20,10 +20,14 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include +#include #include +#include #include +#include +#include #include +#include #include #include @@ -33,13 +37,9 @@ #include "lib/common/units.hpp" #include "lib/common/utility.hpp" #include "lib/rocprofiler-sdk/tests/common.hpp" -#include "rocprofiler-sdk-roctx/api_trace.h" -#include "rocprofiler-sdk-roctx/types.h" -#include "rocprofiler-sdk/callback_tracing.h" -#include "rocprofiler-sdk/context.h" -#include "rocprofiler-sdk/marker/api_id.h" #include +#include #include #include diff --git a/projects/rocprofiler-sdk/tests/CMakeLists.txt b/projects/rocprofiler-sdk/tests/CMakeLists.txt index cfaaee1671..46c930d87a 100644 --- a/projects/rocprofiler-sdk/tests/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/CMakeLists.txt @@ -40,8 +40,11 @@ add_subdirectory(common) # tool libraries used for data collection during integration tests add_subdirectory(tools) +# libraries used by integration test applications +add_subdirectory(lib) + # applications used by integration tests -add_subdirectory(apps) +add_subdirectory(bin) # validation tests add_subdirectory(kernel-tracing) diff --git a/projects/rocprofiler-sdk/tests/async-copy-tracing/CMakeLists.txt b/projects/rocprofiler-sdk/tests/async-copy-tracing/CMakeLists.txt index a5712ceb02..bfa8ab16d5 100644 --- a/projects/rocprofiler-sdk/tests/async-copy-tracing/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/async-copy-tracing/CMakeLists.txt @@ -29,7 +29,8 @@ set(async-copy-tracing-env set_tests_properties( test-async-copy-tracing-execute PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT - "${async-copy-tracing-env}" FAIL_REGULAR_EXPRESSION "threw an exception") + "${async-copy-tracing-env}" FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") foreach(FILENAME validate.py pytest.ini conftest.py) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME} @@ -44,4 +45,4 @@ set_tests_properties( test-async-copy-tracing-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS test-async-copy-tracing-execute FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/apps/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt similarity index 88% rename from projects/rocprofiler-sdk/tests/apps/CMakeLists.txt rename to projects/rocprofiler-sdk/tests/bin/CMakeLists.txt index e478c4db94..52378d8b3d 100644 --- a/projects/rocprofiler-sdk/tests/apps/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt @@ -3,7 +3,7 @@ # cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) -project(rocprofiler-test-apps LANGUAGES C CXX) +project(rocprofiler-tests-bin LANGUAGES C CXX) set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib") @@ -11,6 +11,7 @@ set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib") add_subdirectory(simple-transpose) add_subdirectory(multistream) add_subdirectory(vector-operations) +add_subdirectory(hip-in-libraries) set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib:$" diff --git a/projects/rocprofiler-sdk/tests/bin/hip-in-libraries/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/hip-in-libraries/CMakeLists.txt new file mode 100644 index 0000000000..b01f354467 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/hip-in-libraries/CMakeLists.txt @@ -0,0 +1,34 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project(rocprofiler-tests-bin-hip-in-libraries LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +add_executable(hip-in-libraries) +target_sources(hip-in-libraries PRIVATE hip-in-libraries.cpp) +target_compile_options(hip-in-libraries PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow + -Werror) +target_link_libraries(hip-in-libraries PRIVATE transpose-shared-library + vector-ops-shared-library) + +find_package(hip REQUIRED) +target_link_libraries(hip-in-libraries PRIVATE hip::host) + +find_package(Threads REQUIRED) +target_link_libraries(hip-in-libraries PRIVATE Threads::Threads) + +if(TRANSPOSE_USE_MPI) + find_package(MPI REQUIRED) + target_compile_definitions(hip-in-libraries PRIVATE USE_MPI) + target_link_libraries(hip-in-libraries PRIVATE MPI::MPI_C) +endif() + +install( + TARGETS hip-in-libraries + DESTINATION bin + COMPONENT tests) diff --git a/projects/rocprofiler-sdk/tests/bin/hip-in-libraries/hip-in-libraries.cpp b/projects/rocprofiler-sdk/tests/bin/hip-in-libraries/hip-in-libraries.cpp new file mode 100644 index 0000000000..9e78ec7692 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/hip-in-libraries/hip-in-libraries.cpp @@ -0,0 +1,151 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "transpose.hpp" +#include "vector-ops.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(USE_MPI) +# include +#endif + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + auto _hip_api_print_lk = auto_lock_t{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + throw std::runtime_error("hip_api_call"); \ + } \ + } + +namespace +{ +using auto_lock_t = std::unique_lock; +auto print_lock = std::mutex{}; +size_t nqueues = 8; +size_t nthreads = 4; +size_t nitr = 500; +size_t nsync = 10; +} // namespace + +int +main(int argc, char** argv) +{ + int rank = 0; + int size = 1; + +#if defined(USE_MPI) + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); +#else + (void) size; +#endif + + for(int i = 1; i < argc; ++i) + { + auto _arg = std::string{argv[i]}; + if(_arg == "?" || _arg == "-h" || _arg == "--help") + { + if(rank == 0) + { + fprintf(stderr, + "usage: hip-in-libraries [NUM_QUEUES (%zu)] [NUM_THREADS (%zu)] " + "[NUM_ITERATION (%zu)] " + "[SYNC_EVERY_N_ITERATIONS (%zu)]\n", + nqueues, + nthreads, + nitr, + nsync); + } + exit(EXIT_SUCCESS); + } + } + + if(argc > 1) nqueues = atoll(argv[1]); + if(argc > 2) nthreads = atoll(argv[2]); + if(argc > 3) nitr = atoll(argv[3]); + if(argc > 4) nsync = atoll(argv[4]); + + int ndevice = 0; + HIP_API_CALL(hipGetDeviceCount(&ndevice)); + + printf("[hip-in-libraries] Number of devices found: %i\n", ndevice); + printf("[hip-in-libraries] Number of queues: %zu\n", nqueues); + printf("[hip-in-libraries] Number of threads: %zu\n", nthreads); + printf("[hip-in-libraries] Number of iterations: %zu\n", nitr); + printf("[hip-in-libraries] Syncing every %zu iterations\n", nsync); + + { + auto vector_ops_thread = std::thread{run_vector_ops, nthreads, nqueues}; + auto transpose_thread = std::thread{run_transpose, nthreads, nitr, nsync}; + + vector_ops_thread.join(); + transpose_thread.join(); + } + + // this is a temporary workaround in omnitrace when HIP + MPI is enabled + +#if defined(USE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif + + for(int i = 0; i < ndevice; ++i) + { + HIP_API_CALL(hipSetDevice(i)); + HIP_API_CALL(hipDeviceSynchronize()); + } + +#if defined(USE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif + + if(rank == 0) + { + for(int i = 0; i < ndevice; ++i) + { + HIP_API_CALL(hipSetDevice(i)); + HIP_API_CALL(hipDeviceReset()); + } + } + +#if defined(USE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif + + return 0; +} diff --git a/projects/rocprofiler-sdk/tests/apps/multistream/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/multistream/CMakeLists.txt similarity index 95% rename from projects/rocprofiler-sdk/tests/apps/multistream/CMakeLists.txt rename to projects/rocprofiler-sdk/tests/bin/multistream/CMakeLists.txt index 67144e8353..0ced0f632e 100644 --- a/projects/rocprofiler-sdk/tests/apps/multistream/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/multistream/CMakeLists.txt @@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER) endif() endif() -project(rocprofiler-test-app-multistream LANGUAGES CXX HIP) +project(rocprofiler-tests-bin-multistream LANGUAGES CXX HIP) foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") diff --git a/projects/rocprofiler-sdk/tests/apps/multistream/multistream_app.cpp b/projects/rocprofiler-sdk/tests/bin/multistream/multistream_app.cpp similarity index 100% rename from projects/rocprofiler-sdk/tests/apps/multistream/multistream_app.cpp rename to projects/rocprofiler-sdk/tests/bin/multistream/multistream_app.cpp diff --git a/projects/rocprofiler-sdk/tests/apps/reproducible-runtime/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/reproducible-runtime/CMakeLists.txt similarity index 96% rename from projects/rocprofiler-sdk/tests/apps/reproducible-runtime/CMakeLists.txt rename to projects/rocprofiler-sdk/tests/bin/reproducible-runtime/CMakeLists.txt index 23ee5583a4..ea66c56c1f 100644 --- a/projects/rocprofiler-sdk/tests/apps/reproducible-runtime/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/reproducible-runtime/CMakeLists.txt @@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER) endif() endif() -project(rocprofiler-test-app-reproducible-runtime LANGUAGES CXX HIP) +project(rocprofiler-tests-bin-reproducible-runtime LANGUAGES CXX HIP) foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") diff --git a/projects/rocprofiler-sdk/tests/apps/reproducible-runtime/reproducible-runtime.cpp b/projects/rocprofiler-sdk/tests/bin/reproducible-runtime/reproducible-runtime.cpp similarity index 100% rename from projects/rocprofiler-sdk/tests/apps/reproducible-runtime/reproducible-runtime.cpp rename to projects/rocprofiler-sdk/tests/bin/reproducible-runtime/reproducible-runtime.cpp diff --git a/projects/rocprofiler-sdk/tests/apps/simple-transpose/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/simple-transpose/CMakeLists.txt similarity index 95% rename from projects/rocprofiler-sdk/tests/apps/simple-transpose/CMakeLists.txt rename to projects/rocprofiler-sdk/tests/bin/simple-transpose/CMakeLists.txt index 6ad5b7d54c..7d2b3240d7 100644 --- a/projects/rocprofiler-sdk/tests/apps/simple-transpose/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/simple-transpose/CMakeLists.txt @@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER) endif() endif() -project(rocprofiler-tool-test-app-transpose LANGUAGES CXX HIP) +project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP) foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") diff --git a/projects/rocprofiler-sdk/tests/apps/simple-transpose/simple-transpose.cpp b/projects/rocprofiler-sdk/tests/bin/simple-transpose/simple-transpose.cpp similarity index 100% rename from projects/rocprofiler-sdk/tests/apps/simple-transpose/simple-transpose.cpp rename to projects/rocprofiler-sdk/tests/bin/simple-transpose/simple-transpose.cpp diff --git a/projects/rocprofiler-sdk/tests/apps/transpose/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/transpose/CMakeLists.txt similarity index 96% rename from projects/rocprofiler-sdk/tests/apps/transpose/CMakeLists.txt rename to projects/rocprofiler-sdk/tests/bin/transpose/CMakeLists.txt index 7835468902..e9cebfeba5 100644 --- a/projects/rocprofiler-sdk/tests/apps/transpose/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/transpose/CMakeLists.txt @@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER) endif() endif() -project(rocprofiler-test-app-transpose LANGUAGES CXX HIP) +project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP) foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") diff --git a/projects/rocprofiler-sdk/tests/apps/transpose/transpose.cpp b/projects/rocprofiler-sdk/tests/bin/transpose/transpose.cpp similarity index 96% rename from projects/rocprofiler-sdk/tests/apps/transpose/transpose.cpp rename to projects/rocprofiler-sdk/tests/bin/transpose/transpose.cpp index fb52c8542c..774edec48b 100644 --- a/projects/rocprofiler-sdk/tests/apps/transpose/transpose.cpp +++ b/projects/rocprofiler-sdk/tests/bin/transpose/transpose.cpp @@ -189,7 +189,7 @@ run(int rank, int tid, int devid, int argc, char** argv) HIP_API_CALL(hipStreamCreate(&stream)); auto_lock_t _lk{print_lock}; - std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; _lk.unlock(); std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)}; @@ -230,8 +230,10 @@ run(int rank, int tid, int devid, int argc, char** argv) float GB = (float) size * nitr * 2 / (1 << 30); print_lock.lock(); - std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n" - << "The average performance of transpose is " << GB / time << " GBytes/sec" + std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time + << " sec\n"; + std::cout << "[transpose][" << rank << "][" << tid + << "] The average performance of transpose is " << GB / time << " GBytes/sec" << std::endl; print_lock.unlock(); diff --git a/projects/rocprofiler-sdk/tests/apps/vector-operations/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/vector-operations/CMakeLists.txt similarity index 94% rename from projects/rocprofiler-sdk/tests/apps/vector-operations/CMakeLists.txt rename to projects/rocprofiler-sdk/tests/bin/vector-operations/CMakeLists.txt index 5fb0a9c010..43c99ecc4d 100644 --- a/projects/rocprofiler-sdk/tests/apps/vector-operations/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/vector-operations/CMakeLists.txt @@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER) endif() endif() -project(rocprofiler-tool-test-app-transpose LANGUAGES CXX HIP) +project(rocprofiler-tests-bin-vector-operations LANGUAGES CXX HIP) foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") diff --git a/projects/rocprofiler-sdk/tests/apps/vector-operations/vector-ops.cpp b/projects/rocprofiler-sdk/tests/bin/vector-operations/vector-ops.cpp similarity index 75% rename from projects/rocprofiler-sdk/tests/apps/vector-operations/vector-ops.cpp rename to projects/rocprofiler-sdk/tests/bin/vector-operations/vector-ops.cpp index 71f7050df4..b2f41ff415 100644 --- a/projects/rocprofiler-sdk/tests/apps/vector-operations/vector-ops.cpp +++ b/projects/rocprofiler-sdk/tests/bin/vector-operations/vector-ops.cpp @@ -1,24 +1,25 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ #include #include #include @@ -132,8 +133,12 @@ divide_kernel(float* __restrict__ a, using namespace std; void -run(int NUM_QUEUE) +run(int NUM_QUEUE, int DEVICE_ID) { + HIP_API_CALL(hipSetDevice(DEVICE_ID)); + + HIP_API_CALL(hipDeviceSynchronize()); + std::vector hostA(NUM_QUEUE); std::vector hostB(NUM_QUEUE); std::vector hostC(NUM_QUEUE); @@ -144,10 +149,18 @@ run(int NUM_QUEUE) std::vector streams(NUM_QUEUE); - hipDeviceProp_t devProp; - HIP_API_CALL(hipGetDeviceProperties(&devProp, 0)); + auto sync_stream = [NUM_QUEUE, streams](int q) { + if(q < 0 || q >= NUM_QUEUE) + throw std::runtime_error{std::string{"invalid stream id: "} + std::to_string(q)}; - int i; + HIP_API_CALL(hipStreamSynchronize(streams.at(q))); + }; + + auto sync_streams = [NUM_QUEUE, sync_stream]() { + for(int i = 0; i < NUM_QUEUE; ++i) + sync_stream(i); + HIP_API_CALL(hipDeviceSynchronize()); + }; for(int q = 0; q < NUM_QUEUE; q++) { @@ -158,26 +171,26 @@ run(int NUM_QUEUE) HIP_API_CALL(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0)); // initialize the input data - for(i = 0; i < NUM; i++) + for(int i = 0; i < NUM; i++) { - hostB[q][i] = (float) i; - hostC[q][i] = (float) i * 100.0f; + hostB[q][i] = static_cast(i); + hostC[q][i] = static_cast(i * 100.0f); } - HIP_API_CALL(hipMalloc((void**) (&deviceA[q]), NUM * sizeof(float))); - HIP_API_CALL(hipMalloc((void**) (&deviceB[q]), NUM * sizeof(float))); - HIP_API_CALL(hipMalloc((void**) (&deviceC[q]), NUM * sizeof(float))); + HIP_API_CALL(hipMallocAsync(&deviceA[q], NUM * sizeof(float), streams[q])); + HIP_API_CALL(hipMallocAsync(&deviceB[q], NUM * sizeof(float), streams[q])); + HIP_API_CALL(hipMallocAsync(&deviceC[q], NUM * sizeof(float), streams[q])); HIP_API_CALL(hipMemcpyAsync( deviceB[q], hostB[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q])); HIP_API_CALL(hipMemcpyAsync( deviceC[q], hostC[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q])); } - HIP_API_CALL(hipDeviceSynchronize()); - for(int RUN_I = 0; RUN_I < 2; RUN_I++) + sync_streams(); + + for(int q = 0; q < NUM_QUEUE; q++) { - int q = (4 * RUN_I + 0) % NUM_QUEUE; hipLaunchKernelGGL(addition_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), @@ -189,8 +202,8 @@ run(int NUM_QUEUE) WIDTH, HEIGHT); - HIP_API_CALL(hipDeviceSynchronize()); - q = (4 * RUN_I + 1) % NUM_QUEUE; + HIP_API_CALL(hipGetLastError()); + hipLaunchKernelGGL(subtract_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), @@ -202,8 +215,8 @@ run(int NUM_QUEUE) WIDTH, HEIGHT); - HIP_API_CALL(hipDeviceSynchronize()); - q = (4 * RUN_I + 2) % NUM_QUEUE; + HIP_API_CALL(hipGetLastError()); + hipLaunchKernelGGL(multiply_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), @@ -215,8 +228,8 @@ run(int NUM_QUEUE) WIDTH, HEIGHT); - HIP_API_CALL(hipDeviceSynchronize()); - q = (4 * RUN_I + 3) % NUM_QUEUE; + HIP_API_CALL(hipGetLastError()); + hipLaunchKernelGGL(divide_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), @@ -227,17 +240,18 @@ run(int NUM_QUEUE) deviceC[q], WIDTH, HEIGHT); - HIP_API_CALL(hipDeviceSynchronize()); + + HIP_API_CALL(hipGetLastError()); } - for(int q = 0; q < NUM_QUEUE; q++) - HIP_API_CALL(hipMemcpyAsync( - hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q])); + sync_streams(); for(int q = 0; q < NUM_QUEUE; q++) { - HIP_API_CALL(hipMemcpy(hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost)); - HIP_API_CALL(hipDeviceSynchronize()); + HIP_API_CALL(hipMemcpyAsync( + hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q])); + + sync_stream(q); HIP_API_CALL(hipFree(deviceA[q])); HIP_API_CALL(hipFree(deviceB[q])); @@ -246,13 +260,21 @@ run(int NUM_QUEUE) HIP_API_CALL(hipHostFree(hostA[q])); HIP_API_CALL(hipHostFree(hostB[q])); HIP_API_CALL(hipHostFree(hostC[q])); + HIP_API_CALL(hipStreamDestroy(streams[q])); } + + HIP_API_CALL(hipDeviceSynchronize()); } int main() { - run(1); + int device_count = 0; + HIP_API_CALL(hipGetDeviceCount(&device_count)); + + for(int i = 0; i < device_count; ++i) + run(4, i); + return 0; } diff --git a/projects/rocprofiler-sdk/tests/c-tool/CMakeLists.txt b/projects/rocprofiler-sdk/tests/c-tool/CMakeLists.txt index ef341f32fa..27a67f6f32 100644 --- a/projects/rocprofiler-sdk/tests/c-tool/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/c-tool/CMakeLists.txt @@ -35,4 +35,4 @@ set_tests_properties( PASS_REGULAR_EXPRESSION "Test C tool is using rocprofiler-sdk v([0-9]+\\.[0-9]+\\.[0-9]+)" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/common/CMakeLists.txt b/projects/rocprofiler-sdk/tests/common/CMakeLists.txt index 26fc71bb0b..36750706b9 100644 --- a/projects/rocprofiler-sdk/tests/common/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/common/CMakeLists.txt @@ -6,6 +6,11 @@ include(FetchContent) set(FETCHCONTENT_BASE_DIR ${PROJECT_BINARY_DIR}/external) +# default FAIL_REGULAR_EXPRESSION for tests +set(ROCPROFILER_DEFAULT_FAIL_REGEX + "threw an exception|Permission denied|Could not create logging file" + CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests") + # build flags add_library(rocprofiler-tests-build-flags INTERFACE) add_library(rocprofiler::tests-build-flags ALIAS rocprofiler-tests-build-flags) diff --git a/projects/rocprofiler-sdk/tests/counter-collection/CMakeLists.txt b/projects/rocprofiler-sdk/tests/counter-collection/CMakeLists.txt index e4409a5ad5..9efd75dd67 100644 --- a/projects/rocprofiler-sdk/tests/counter-collection/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/counter-collection/CMakeLists.txt @@ -29,7 +29,7 @@ set_tests_properties( ENVIRONMENT "${PRELOAD_ENV};HSA_TOOLS_LIB=$;ROCPROFILER_TOOL_OUTPUT_FILE=counter-collection-test.json;ROCPROFILER_TOOL_CONTEXTS=COUNTER_COLLECTION;ROCPROF_COUNTERS=SQ_WAVES_sum" FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") foreach(FILENAME validate.py pytest.ini conftest.py) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME} @@ -44,4 +44,4 @@ set_tests_properties( test-counter-collection-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS test-counter-collection-execute FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/kernel-tracing/CMakeLists.txt b/projects/rocprofiler-sdk/tests/kernel-tracing/CMakeLists.txt index 68af6c6e76..74e911fa67 100644 --- a/projects/rocprofiler-sdk/tests/kernel-tracing/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/kernel-tracing/CMakeLists.txt @@ -34,7 +34,7 @@ set(kernel-tracing-env set_tests_properties( test-kernel-tracing-execute PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${kernel-tracing-env}" - FAIL_REGULAR_EXPRESSION "threw an exception") + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") foreach(FILENAME validate.py pytest.ini conftest.py) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME} @@ -49,4 +49,4 @@ add_test( set_tests_properties( test-kernel-tracing-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS test-kernel-tracing-execute - FAIL_REGULAR_EXPRESSION "threw an exception") + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/lib/CMakeLists.txt b/projects/rocprofiler-sdk/tests/lib/CMakeLists.txt new file mode 100644 index 0000000000..cc17a27f5f --- /dev/null +++ b/projects/rocprofiler-sdk/tests/lib/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# Integration test application libraries +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project(rocprofiler-tests-lib LANGUAGES C CXX) + +set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib") + +# libraries used by integration test apps which DO NOT link to rocprofiler-sdk-roctx +add_subdirectory(vector-operations) + +set(CMAKE_BUILD_RPATH + "\$ORIGIN:\$ORIGIN/../lib:$" + ) + +# libraries used by integration test apps which DO link to rocprofiler-sdk-roctx +add_subdirectory(transpose) diff --git a/projects/rocprofiler-sdk/tests/lib/transpose/CMakeLists.txt b/projects/rocprofiler-sdk/tests/lib/transpose/CMakeLists.txt new file mode 100644 index 0000000000..ae69cae770 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/lib/transpose/CMakeLists.txt @@ -0,0 +1,61 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-tests-lib-transpose-shared-library LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +option(TRANSPOSE_USE_MPI "Enable MPI support in transpose-shared-library exe" OFF) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(transpose.cpp PROPERTIES LANGUAGE HIP) +add_library(transpose-shared-library SHARED) +target_sources(transpose-shared-library PRIVATE transpose.cpp) +target_compile_options(transpose-shared-library PRIVATE -W -Wall -Wextra -Wpedantic + -Wshadow -Werror) +target_include_directories(transpose-shared-library PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +set_target_properties(transpose-shared-library PROPERTIES OUTPUT_NAME transpose) + +find_package(Threads REQUIRED) +target_link_libraries(transpose-shared-library PRIVATE Threads::Threads) + +find_package(rocprofiler-sdk-roctx REQUIRED) +target_link_libraries(transpose-shared-library + PRIVATE rocprofiler-sdk-roctx::rocprofiler-sdk-roctx) + +if(TRANSPOSE_USE_MPI) + find_package(MPI REQUIRED) + target_compile_definitions(transpose-shared-library PRIVATE USE_MPI) + target_link_libraries(transpose-shared-library PRIVATE MPI::MPI_C) +endif() + +install( + TARGETS transpose-shared-library + DESTINATION lib + COMPONENT tests) diff --git a/projects/rocprofiler-sdk/tests/lib/transpose/transpose.cpp b/projects/rocprofiler-sdk/tests/lib/transpose/transpose.cpp new file mode 100644 index 0000000000..9a935c751a --- /dev/null +++ b/projects/rocprofiler-sdk/tests/lib/transpose/transpose.cpp @@ -0,0 +1,260 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "hip/hip_runtime.h" +#include "rocprofiler-sdk-roctx/roctx.h" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(USE_MPI) +# include +#endif + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + auto _hip_api_print_lk = auto_lock_t{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + throw std::runtime_error("hip_api_call"); \ + } \ + } + +namespace +{ +using auto_lock_t = std::unique_lock; +auto print_lock = std::mutex{}; +constexpr unsigned shared_mem_tile_dim = 32; + +void +check_hip_error(void); + +void +verify(int* in, int* out, int M, int N); + +__global__ void +transpose(const int* in, int* out, int M, int N); + +void +run_transpose_impl(int rank, int tid, int ndevice, size_t nitr, size_t nsync); + +__global__ void +transpose(const int* in, int* out, int M, int N) +{ + __shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim]; + + int idx = (blockIdx.y * blockDim.y + threadIdx.y) * M + blockIdx.x * blockDim.x + threadIdx.x; + tile[threadIdx.y][threadIdx.x] = in[idx]; + __syncthreads(); + idx = (blockIdx.x * blockDim.x + threadIdx.y) * N + blockIdx.y * blockDim.y + threadIdx.x; + out[idx] = tile[threadIdx.x][threadIdx.y]; +} + +void +run_transpose_impl(int rank, int tid, int devid, size_t nitr, size_t nsync) +{ + roctxRangePush("run_transpose_impl"); + + constexpr unsigned int M = 4960 * 2; + constexpr unsigned int N = 4960 * 2; + + hipStream_t stream = {}; + + printf("[transpose] Rank %i, thread %i assigned to device %i\n", rank, tid, devid); + HIP_API_CALL(hipSetDevice(devid)); + HIP_API_CALL(hipStreamCreate(&stream)); + + auto_lock_t _lk{print_lock}; + std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + _lk.unlock(); + + std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)}; + std::uniform_int_distribution _dist{0, 1000}; + + size_t size = sizeof(int) * M * N; + int* inp_matrix = new int[size]; + int* out_matrix = new int[size]; + for(size_t i = 0; i < M * N; i++) + { + inp_matrix[i] = _dist(_engine); + out_matrix[i] = 0; + } + int* in = nullptr; + int* out = nullptr; + + HIP_API_CALL(hipMalloc(&in, size)); + HIP_API_CALL(hipMalloc(&out, size)); + HIP_API_CALL(hipMemsetAsync(in, 0, size, stream)); + HIP_API_CALL(hipMemsetAsync(out, 0, size, stream)); + HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream)); + HIP_API_CALL(hipStreamSynchronize(stream)); + + dim3 grid(M / 32, N / 32, 1); + dim3 block(32, 32, 1); // transpose + + auto t1 = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < nitr; ++i) + { + transpose<<>>(in, out, M, N); + check_hip_error(); + if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream)); + } + auto t2 = std::chrono::high_resolution_clock::now(); + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream)); + double time = std::chrono::duration_cast>(t2 - t1).count(); + float GB = (float) size * nitr * 2 / (1 << 30); + + print_lock.lock(); + std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time + << " sec\n"; + std::cout << "[transpose][" << rank << "][" << tid + << "] The average performance of transpose is " << GB / time << " GBytes/sec" + << std::endl; + print_lock.unlock(); + + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipStreamDestroy(stream)); + + // cpu_transpose(matrix, out_matrix, M, N); + verify(inp_matrix, out_matrix, M, N); + + HIP_API_CALL(hipFree(in)); + HIP_API_CALL(hipFree(out)); + + delete[] inp_matrix; + delete[] out_matrix; + + roctxRangePop(); +} + +void +check_hip_error(void) +{ + hipError_t err = hipGetLastError(); + if(err != hipSuccess) + { + auto_lock_t _lk{print_lock}; + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; + throw std::runtime_error("hip_api_call"); + } +} + +void +verify(int* in, int* out, int M, int N) +{ + for(int i = 0; i < 10; i++) + { + int row = rand() % M; + int col = rand() % N; + if(in[row * N + col] != out[col * M + row]) + { + auto_lock_t _lk{print_lock}; + std::cout << "mismatch: " << row << ", " << col << " : " << in[row * N + col] << " | " + << out[col * M + row] << "\n"; + } + } +} +} // namespace + +void +run_transpose(size_t nthreads, size_t nitr, size_t nsync) +{ + auto range_id = roctxRangeStart("run_transpose"); + + int rank = 0; + int size = 1; + + printf("[transpose] Number of threads: %zu\n", nthreads); + printf("[transpose] Number of iterations: %zu\n", nitr); + printf("[transpose] Syncing every %zu iterations\n", nsync); + +#if defined(USE_MPI) + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); +#else + (void) size; +#endif + // this is a temporary workaround in omnitrace when HIP + MPI is enabled + int ndevice = 0; + HIP_API_CALL(hipGetDeviceCount(&ndevice)); + printf("[transpose] Number of devices found: %i\n", ndevice); + auto devids = std::vector{}; + devids.resize(size * nthreads, 0); + int devid = 0; + for(size_t i = 0; i < nthreads; ++i) + { + for(int j = 0; j < size; ++j) + { + auto idx = (j * nthreads) + i; + devids.at(idx) = devid++ % ndevice; + } + } + auto devid_offset = (rank * nthreads); + auto _threads = std::vector{}; + for(size_t i = 1; i < nthreads; ++i) + _threads.emplace_back( + run_transpose_impl, rank, i, devids.at(devid_offset + i), nitr, nsync); + run_transpose_impl(rank, 0, devids.at(devid_offset + 0), nitr, nsync); + for(auto& itr : _threads) + itr.join(); + +#if defined(USE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif + + // for(int i = 0; i < ndevice; ++i) + // { + // HIP_API_CALL(hipSetDevice(i)); + // HIP_API_CALL(hipDeviceSynchronize()); + // } + + // #if defined(USE_MPI) + // MPI_Barrier(MPI_COMM_WORLD); + // #endif + + // if(rank == 0) + // { + // for(int i = 0; i < ndevice; ++i) + // { + // HIP_API_CALL(hipSetDevice(i)); + // HIP_API_CALL(hipDeviceReset()); + // } + // } + + // #if defined(USE_MPI) + // MPI_Barrier(MPI_COMM_WORLD); + // #endif + + roctxRangeStop(range_id); +} diff --git a/projects/rocprofiler-sdk/tests/lib/transpose/transpose.hpp b/projects/rocprofiler-sdk/tests/lib/transpose/transpose.hpp new file mode 100644 index 0000000000..b83c2f062b --- /dev/null +++ b/projects/rocprofiler-sdk/tests/lib/transpose/transpose.hpp @@ -0,0 +1,28 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include + +void +run_transpose(size_t nthreads, size_t nitr, size_t nsync); diff --git a/projects/rocprofiler-sdk/tests/lib/vector-operations/CMakeLists.txt b/projects/rocprofiler-sdk/tests/lib/vector-operations/CMakeLists.txt new file mode 100644 index 0000000000..e550d8dd76 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/lib/vector-operations/CMakeLists.txt @@ -0,0 +1,49 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-tests-lib-vector-operations LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(vector-ops.cpp PROPERTIES LANGUAGE HIP) +add_library(vector-ops-shared-library SHARED) +target_sources(vector-ops-shared-library PRIVATE vector-ops.cpp) +target_compile_options(vector-ops-shared-library PRIVATE -W -Wall -Wextra -Wpedantic + -Wshadow -Werror) +target_include_directories(vector-ops-shared-library PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +set_target_properties(vector-ops-shared-library PROPERTIES OUTPUT_NAME vector-ops) + +find_package(Threads REQUIRED) +target_link_libraries(vector-ops-shared-library PRIVATE Threads::Threads) + +install( + TARGETS vector-ops-shared-library + DESTINATION lib + COMPONENT tests) diff --git a/projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.cpp b/projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.cpp new file mode 100644 index 0000000000..00914c4fd9 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.cpp @@ -0,0 +1,291 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + auto _hip_api_print_lk = auto_lock_t{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + throw std::runtime_error("hip_api_call"); \ + } \ + } + +namespace +{ +using auto_lock_t = std::unique_lock; +auto print_lock = std::mutex{}; + +constexpr auto WIDTH = (1 << 12); // 4096 +constexpr auto HEIGHT = (1 << 11); // 2048 +constexpr auto DEPTH = (1 << 0); // 1 +constexpr auto NUM = (WIDTH * HEIGHT * DEPTH); + +struct dimensions +{ + int x = 1; + int y = 1; + int z = 1; +}; + +constexpr auto threads_per_block = dimensions{64, 1, 1}; + +// Computes vectorAdd with matrix-multiply +template +__global__ void +addition_kernel(Tp* __restrict__ a, + const Tp* __restrict__ b, + const Tp* __restrict__ c, + int width, + int /*height*/) +{ + // printf("addition kernel\n"); + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if(x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = b[index] + c[index]; +} + +template +__global__ void +subtract_kernel(Tp* __restrict__ a, + const Tp* __restrict__ b, + const Tp* __restrict__ c, + int width, + int /*height*/) +{ + // printf("subtract kernel\n"); + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if(x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = abs(b[index] - c[index]); +} + +template +__global__ void +multiply_kernel(Tp* __restrict__ a, + const Tp* __restrict__ b, + const Tp* __restrict__ c, + int width, + int /*height*/) +{ + // printf("multiply kernel\n"); + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if(x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = (b[index] - 1) * (c[index] - 1) + 1; +} + +template +__global__ void +divide_kernel(Tp* __restrict__ a, + const Tp* __restrict__ b, + const Tp* __restrict__ c, + int width, + int /*height*/) +{ + // printf("divide kernel\n"); + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if(x >= WIDTH || y >= HEIGHT) return; + int index = y * width + x; + + a[index] = (b[index] - c[index]) / abs(c[index] + b[index]) + 1; +} + +void +run_vector_ops_impl(int num_queue, int device_id) +{ + auto t1 = std::chrono::high_resolution_clock::now(); + + HIP_API_CALL(hipSetDevice(device_id)); + + std::vector hostA(num_queue); + std::vector hostB(num_queue); + std::vector hostC(num_queue); + + std::vector deviceA(num_queue); + std::vector deviceB(num_queue); + std::vector deviceC(num_queue); + + std::vector streams(num_queue); + + auto sync_stream = [num_queue, streams](int q) { + if(q < 0 || q >= num_queue) + throw std::runtime_error{std::string{"invalid stream id: "} + std::to_string(q)}; + + HIP_API_CALL(hipStreamSynchronize(streams.at(q))); + }; + + auto sync_streams = [num_queue, sync_stream]() { + for(int i = 0; i < num_queue; ++i) + sync_stream(i); + }; + + for(int q = 0; q < num_queue; q++) + { + HIP_API_CALL(hipStreamCreateWithFlags(&streams[q], hipStreamNonBlocking)); + + HIP_API_CALL(hipHostMalloc(&hostA[q], NUM * sizeof(float), 0)); + HIP_API_CALL(hipHostMalloc(&hostB[q], NUM * sizeof(float), 0)); + HIP_API_CALL(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0)); + + // initialize the input data + for(int i = 0; i < NUM; i++) + { + hostB[q][i] = static_cast(i); + hostC[q][i] = static_cast(i * 100.0f); + } + + HIP_API_CALL(hipMallocAsync(&deviceA[q], NUM * sizeof(float), streams[q])); + HIP_API_CALL(hipMallocAsync(&deviceB[q], NUM * sizeof(float), streams[q])); + HIP_API_CALL(hipMallocAsync(&deviceC[q], NUM * sizeof(float), streams[q])); + + HIP_API_CALL(hipMemcpyAsync( + deviceB[q], hostB[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q])); + HIP_API_CALL(hipMemcpyAsync( + deviceC[q], hostC[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q])); + } + + sync_streams(); + + for(int q = 0; q < num_queue; q++) + { + hipLaunchKernelGGL(addition_kernel, + dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y), + dim3(threads_per_block.x, threads_per_block.y), + 0, + streams[q], + deviceA[q], + deviceB[q], + deviceC[q], + WIDTH, + HEIGHT); + + hipLaunchKernelGGL(subtract_kernel, + dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y), + dim3(threads_per_block.x, threads_per_block.y), + 0, + streams[q], + deviceA[q], + deviceB[q], + deviceC[q], + WIDTH, + HEIGHT); + + hipLaunchKernelGGL(multiply_kernel, + dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y), + dim3(threads_per_block.x, threads_per_block.y), + 0, + streams[q], + deviceA[q], + deviceB[q], + deviceC[q], + WIDTH, + HEIGHT); + + hipLaunchKernelGGL(divide_kernel, + dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y), + dim3(threads_per_block.x, threads_per_block.y), + 0, + streams[q], + deviceB[q], + deviceA[q], + deviceC[q], + WIDTH, + HEIGHT); + } + + sync_streams(); + + for(int q = 0; q < num_queue; q++) + { + HIP_API_CALL(hipMemcpyAsync( + hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q])); + + sync_stream(q); + + HIP_API_CALL(hipFree(deviceA[q])); + HIP_API_CALL(hipFree(deviceB[q])); + HIP_API_CALL(hipFree(deviceC[q])); + + HIP_API_CALL(hipHostFree(hostA[q])); + HIP_API_CALL(hipHostFree(hostB[q])); + HIP_API_CALL(hipHostFree(hostC[q])); + + HIP_API_CALL(hipStreamDestroy(streams[q])); + } + + auto t2 = std::chrono::high_resolution_clock::now(); + double time = std::chrono::duration_cast>(t2 - t1).count(); + + print_lock.lock(); + std::cout << "[vector-ops] Runtime of vector-ops is " << time << " sec\n"; + print_lock.unlock(); +} +} // namespace + +void +run_vector_ops(int num_threads, int num_queue) +{ + int device_count = 0; + HIP_API_CALL(hipGetDeviceCount(&device_count)); + + if(device_count == 0) throw std::runtime_error{"No HIP devices found"}; + + num_threads = std::max(num_threads, 1); + num_queue = std::max(num_queue, 1); + + auto _threads = std::vector{}; + _threads.reserve(num_threads); + + for(int i = 0; i < num_threads; ++i) + _threads.emplace_back(run_vector_ops_impl, num_queue, i % device_count); + + for(auto& itr : _threads) + itr.join(); +} diff --git a/projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.hpp b/projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.hpp new file mode 100644 index 0000000000..7a1604ae77 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/lib/vector-operations/vector-ops.hpp @@ -0,0 +1,26 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +void +run_vector_ops(int num_threads, int num_queue); diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt index 2d2035e631..de533f4c17 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt @@ -24,5 +24,6 @@ enable_testing() include(CTest) add_subdirectory(tracing) -add_subdirectory(counter-collection) add_subdirectory(tracing-plus-cc) +add_subdirectory(tracing-hip-in-libraries) +add_subdirectory(counter-collection) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/CMakeLists.txt index f26986d80e..04d5e79b50 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/CMakeLists.txt @@ -1,3 +1,7 @@ +# +# Various counter collection tests +# + foreach(FILENAME conftest.py pytest.ini) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME} ${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/CMakeLists.txt index c104d20aa4..097f25587e 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/CMakeLists.txt @@ -40,7 +40,7 @@ set(cc-env-pmc1 set_tests_properties( rocprofv3-test-counter-collection-pmc1-execute PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-pmc1}" - FAIL_REGULAR_EXPRESSION "threw an exception") + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") add_test(NAME rocprofv3-test-counter-collection-pmc1-validate COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input @@ -50,4 +50,4 @@ set_tests_properties( rocprofv3-test-counter-collection-pmc1-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS rocprofv3-test-counter-collection-pmc1-execute FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/validate.py index 5f94e82628..6a465a70bb 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input1/validate.py @@ -1,30 +1,46 @@ -import pandas as pd +#!/usr/bin/env python3 + import sys import pytest +import numpy as np +import pandas as pd -kernel_list = ["addition_kernel", "subtract_kernel", "multiply_kernel", "divide_kernel"] +kernel_list = sorted( + ["addition_kernel", "subtract_kernel", "multiply_kernel", "divide_kernel"] +) + + +def unique(lst): + return list(set(lst)) def test_validate_counter_collection_pmc1(input_data: pd.DataFrame): df = input_data - assert df.empty == False + assert not df.empty assert (df["Agent_Id"].astype(int).values > 0).all() assert (df["Queue_Id"].astype(int).values > 0).all() assert (df["Process_Id"].astype(int).values > 0).all() - assert len(df["Kernel-Name"]) > 0 - df_list = df["Kernel-Name"].values.flatten().tolist() - # Check if each string in kernel_list is present at least once - missing_kernels = [] - for kernel in kernel_list: - if kernel not in df_list: - missing_kernels.append(kernel) + assert len(df["Kernel_Name"]) > 0 + + assert kernel_list == sorted(df["Kernel_Name"].unique().tolist()) + + kernel_count = dict([[itr, 0] for itr in kernel_list]) + assert len(kernel_count) == len(kernel_list) + for itr in df["Kernel_Name"]: + kernel_count[itr] += 1 + kn_cnt = [itr for _, itr in kernel_count.items()] + assert min(kn_cnt) == max(kn_cnt) and len(unique(kn_cnt)) == 1 - assert ( - not missing_kernels - ), f"The following kernel names are missing from the out file: {missing_kernels}" - assert df["Counter_Name"].str.contains("SQ_WAVES").all() assert len(df["Counter_Value"]) > 0 + assert df["Counter_Name"].str.contains("SQ_WAVES").all() + assert (df["Counter_Value"].astype(int).values > 0).all() + + di_list = df["Dispatch_Id"].astype(int).values.tolist() + di_uniq = sorted(df["Dispatch_Id"].unique().tolist()) + # make sure the dispatch ids are unique and ordered + di_expect = [idx + 1 for idx in range(len(di_list))] + assert di_expect == di_uniq if __name__ == "__main__": diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/CMakeLists.txt index 5daf371a94..eefd04a20d 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/CMakeLists.txt @@ -40,7 +40,7 @@ set(cc-env-pmc2 set_tests_properties( rocprofv3-test-counter-collection-pmc2-execute PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-pmc2}" - FAIL_REGULAR_EXPRESSION "threw an exception") + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") add_test(NAME rocprofv3-test-counter-collection-pmc2-validate COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py @@ -50,4 +50,4 @@ set_tests_properties( rocprofv3-test-counter-collection-pmc2-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS rocprofv3-test-counter-collection-pmc2-execute FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/validate.py index 33290f8d89..534412ff25 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/input2/validate.py @@ -34,7 +34,7 @@ def test_validate_counter_collection_pmc2(input_dir: pd.DataFrame): with open(file_path, "r") as file: df = pd.read_csv(file) # check if kernel-name is present - assert len(df["Kernel-Name"]) > 0 + assert len(df["Kernel_Name"]) > 0 # check if counter value is positive assert len(df["Counter_Value"]) > 0 diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/list_metrics/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/list_metrics/CMakeLists.txt index edbb364d55..b660439d1c 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/list_metrics/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/list_metrics/CMakeLists.txt @@ -41,7 +41,7 @@ set(cc-env-list-metrics set_tests_properties( rocprofv3-test-list-metrics-execute PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-list-metrics}" - FAIL_REGULAR_EXPRESSION "threw an exception") + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") set_tests_properties( rocprofv3-test-list-metrics-std-out-execute @@ -76,6 +76,6 @@ set_tests_properties( DEPENDS rocprofv3-test-list-metrics-execute FAIL_REGULAR_EXPRESSION - "threw an exception" + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" ATTACHED_FILES_ON_FAIL "${VALIDATION_FILES}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/pytest.ini b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/pytest.ini index e7bc653ac4..324df0a4b5 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/pytest.ini +++ b/projects/rocprofiler-sdk/tests/rocprofv3/counter-collection/pytest.ini @@ -3,3 +3,4 @@ addopts = --durations=20 -rA -s -vv testpaths = input1/validate.py input2/validate.py + list_metrics/validate.py diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt new file mode 100644 index 0000000000..0579b18563 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/CMakeLists.txt @@ -0,0 +1,72 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-rocprofv3-tracing-hip-in-libraries + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +add_test( + NAME rocprofv3-test-trace-hip-in-libraries-execute + COMMAND + $ --hip-runtime-trace + --hip-compiler-trace --hsa-core-trace --hsa-amd-trace --hsa-image-trace + --hsa-finalizer-trace --kernel-trace --memory-copy-trace -d + ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace -o out $) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(tracing-env "${PRELOAD_ENV}" + "HSA_TOOLS_LIB=$") + +set_tests_properties( + rocprofv3-test-trace-hip-in-libraries-execute + PROPERTIES + LABELS + "integration-tests" + ENVIRONMENT + "${tracing-env}" + FAIL_REGULAR_EXPRESSION + "HSA_CORE_API|HSA_AMD_EXT_API|HSA_IMAGE_EXT_API|HSA_FINALIZER_EXT_API|HIP_API|HIP_COMPILER_API|KERNEL_DISPATCH|CODE_OBJECT" + ) + +foreach(FILENAME validate.py conftest.py) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME} + ${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY) +endforeach() + +add_test( + NAME rocprofv3-test-trace-hip-in-libraries-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --hsa-input + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hsa_api_trace.csv + --hip-input + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hip_api_trace.csv + --kernel-input + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_kernel_trace.csv + --memory-copy-input + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_memory_copy_trace.csv) + +set(VALIDATION_FILES + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_memory_copy_trace.csv + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hsa_api_trace.csv + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hip_api_trace.csv + ${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_kernel_trace.csv) + +set_tests_properties( + rocprofv3-test-trace-hip-in-libraries-validate + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests" + DEPENDS + rocprofv3-test-trace-hip-in-libraries-execute + FAIL_REGULAR_EXPRESSION + "AssertionError" + ATTACHED_FILES_ON_FAIL + "${VALIDATION_FILES}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/conftest.py b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/conftest.py new file mode 100644 index 0000000000..6ca73a0aaf --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/conftest.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +import os +import csv +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--hsa-input", + action="store", + help="Path to HSA API tracing CSV file.", + ) + parser.addoption( + "--kernel-input", + action="store", + help="Path to kernel tracing CSV file.", + ) + parser.addoption( + "--memory-copy-input", + action="store", + help="Path to memory-copy tracing CSV file.", + ) + parser.addoption( + "--marker-input", + action="store", + help="Path to marker API tracing CSV file.", + ) + parser.addoption( + "--hip-input", + action="store", + help="Path to HIP runtime and compiler API tracing CSV file.", + ) + + +@pytest.fixture +def hsa_input_data(request): + filename = request.config.getoption("--hsa-input") + data = [] + with open(filename, "r") as inp: + reader = csv.DictReader(inp) + for row in reader: + data.append(row) + + return data + + +@pytest.fixture +def kernel_input_data(request): + filename = request.config.getoption("--kernel-input") + data = [] + with open(filename, "r") as inp: + reader = csv.DictReader(inp) + for row in reader: + data.append(row) + + return data + + +@pytest.fixture +def memory_copy_input_data(request): + filename = request.config.getoption("--memory-copy-input") + data = [] + with open(filename, "r") as inp: + reader = csv.DictReader(inp) + for row in reader: + data.append(row) + + return data + + +@pytest.fixture +def marker_input_data(request): + filename = request.config.getoption("--marker-input") + data = [] + with open(filename, "r") as inp: + reader = csv.DictReader(inp) + for row in reader: + data.append(row) + + return data + + +@pytest.fixture +def hip_input_data(request): + filename = request.config.getoption("--hip-input") + data = [] + if os.path.exists(filename): + with open(filename, "r") as inp: + reader = csv.DictReader(inp) + for row in reader: + data.append(row) + + return data diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/validate.py new file mode 100644 index 0000000000..207c599997 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-hip-in-libraries/validate.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 + +import sys +import pytest + + +class dim3(object): + def __init__(self, x, y, z): + self.x = int(x) + self.y = int(y) + self.z = int(z) + + def as_tuple(self): + return (self.x, self.y, self.z) + + +def test_api_trace(hsa_input_data, hip_input_data): + functions = [] + correlation_ids = [] + for row in hsa_input_data: + assert row["Domain"] in ( + "HSA_CORE_API", + "HSA_AMD_EXT_API", + "HSA_IMAGE_EXT_API", + "HSA_FINALIZE_EXT_API", + ) + assert int(row["Process_Id"]) > 0 + assert int(row["Thread_Id"]) >= int(row["Process_Id"]) + assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"]) + functions.append(row["Function"]) + correlation_ids.append(int(row["Correlation_Id"])) + + for row in hip_input_data: + assert row["Domain"] in [ + "HIP_RUNTIME_API", + "HIP_COMPILER_API", + ] + assert int(row["Process_Id"]) > 0 + assert int(row["Thread_Id"]) == 0 or int(row["Thread_Id"]) >= int( + row["Process_Id"] + ) + assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"]) + functions.append(row["Function"]) + correlation_ids.append(int(row["Correlation_Id"])) + + correlation_ids = sorted(list(set(correlation_ids))) + + # all correlation ids are unique + assert len(correlation_ids) == (len(hsa_input_data) + len(hip_input_data)) + # correlation ids are numbered from 1 to N + assert correlation_ids[0] == 1 + assert correlation_ids[-1] == len(correlation_ids) + + functions = list(set(functions)) + for itr in ( + "hsa_amd_memory_async_copy_on_engine", + "hsa_agent_get_info", + "hsa_agent_iterate_isas", + "hsa_signal_create", + "hsa_agent_get_info", + "hsa_executable_symbol_get_info", + ): + assert itr in functions + if hip_input_data: + for itr in ( + "hipGetLastError", + "hipLaunchKernel", + "hipStreamSynchronize", + "hipMemcpyAsync", + "hipFree", + "hipStreamDestroy", + "hipDeviceSynchronize", + "hipDeviceReset", + "hipSetDevice", + ): + assert itr in functions + + +def test_kernel_trace(kernel_input_data): + valid_kernel_names = sorted( + [ + "__amd_rocclr_fillBufferAligned", + "(anonymous namespace)::transpose(int const*, int*, int, int)", + "void (anonymous namespace)::addition_kernel(float*, float const*, float const*, int, int)", + "void (anonymous namespace)::divide_kernel(float*, float const*, float const*, int, int)", + "void (anonymous namespace)::multiply_kernel(float*, float const*, float const*, int, int)", + "void (anonymous namespace)::subtract_kernel(float*, float const*, float const*, int, int)", + ] + ) + + kernels = [] + for row in kernel_input_data: + kernel_name = row["Kernel_Name"] + + assert row["Kind"] == "KERNEL_DISPATCH" + assert int(row["Agent_Id"]) > 0 + assert int(row["Queue_Id"]) > 0 + assert int(row["Kernel_Id"]) > 0 + assert int(row["Correlation_Id"]) > 0 + assert kernel_name in valid_kernel_names + + if kernel_name not in kernels: + kernels.append(kernel_name) + + workgrp_size = dim3( + row["Workgroup_Size_X"], row["Workgroup_Size_Y"], row["Workgroup_Size_Z"] + ) + grid_size = dim3(row["Grid_Size_X"], row["Grid_Size_Y"], row["Grid_Size_Z"]) + + if kernel_name == "__amd_rocclr_fillBufferAligned": + assert workgrp_size.as_tuple() > (1, 1, 1) + assert grid_size.as_tuple() > (1, 1, 1) + elif "transpose" in kernel_name: + assert workgrp_size.as_tuple() == (32, 32, 1) + assert grid_size.as_tuple() == (9920, 9920, 1) + else: + assert workgrp_size.as_tuple() == (64, 1, 1) + assert grid_size.as_tuple() == (4096, 2048, 1) + + assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"]) + + kernels = sorted(list(set(kernels))) + assert kernels == valid_kernel_names + + +def test_memory_copy_trace(memory_copy_input_data): + for row in memory_copy_input_data: + assert row["Kind"] == "MEMORY_COPY" + assert row["Direction"] in ("HOST_TO_DEVICE", "DEVICE_TO_HOST") + if row["Direction"] == "HOST_TO_DEVICE": + assert int(row["Source_Agent_Id"]) == 0 + elif row["Direction"] == "DEVICE_TO_HOST": + assert int(row["Destination_Agent_Id"]) == 0 + assert int(row["Correlation_Id"]) > 0 + assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"]) + + assert len(memory_copy_input_data) == 120 + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/CMakeLists.txt index 99c5298b6d..9ad9fe8ed2 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/CMakeLists.txt @@ -40,7 +40,7 @@ set(cc-tracing-env set_tests_properties( rocprofv3-test-tracing-plus-cc-execute PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-tracing-env}" - FAIL_REGULAR_EXPRESSION "threw an exception") + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") add_test(NAME rocprofv3-test-tracing-plus-cc-validate COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py @@ -50,4 +50,4 @@ set_tests_properties( rocprofv3-test-tracing-plus-cc-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS rocprofv3-test-tracing-plus-cc-execute FAIL_REGULAR_EXPRESSION - "threw an exception") + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/validate.py index 43548661da..e35a8c7cd8 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing-plus-cc/validate.py @@ -34,7 +34,7 @@ def test_validate_counter_collection_plus_tracing(input_dir: pd.DataFrame): with open(file_path, "r") as file: df = pd.read_csv(file) # check if either kernel-name/FUNCTION is present - assert "Kernel-Name" in df.columns or "Function" in df.columns + assert "Kernel_Name" in df.columns or "Function" in df.columns if __name__ == "__main__": diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py index a9d717f09a..3fb2195c0d 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py +++ b/projects/rocprofiler-sdk/tests/rocprofv3/tracing/validate.py @@ -23,7 +23,7 @@ def test_hsa_api_trace(hsa_input_data): correlation_ids = sorted(list(set(correlation_ids))) hsa_api_calls_offset = 2 # roctxRangePush is first - num_marker_api_calls = 6 # seven marker API calls, only six entries in + num_marker_api_calls = 7 # seven marker API calls, only six entries in # marker csv data because roctxRangePush + roctxRangePop is one entry # all correlation ids are unique diff --git a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp index 21be82fbed..7dcc94b6cb 100644 --- a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp +++ b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp @@ -526,19 +526,16 @@ counter_collection_buffered(rocprofiler_context_id_t, /*context*/ } void -dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/ - const rocprofiler_agent_t* agent, - rocprofiler_correlation_id_t, /*correlation_id*/ - const hsa_kernel_dispatch_packet_t*, /*dispatch_packet*/ - uint64_t, /*kernel_id*/ - void* /*callback_data_args*/, - rocprofiler_profile_config_id_t* config) +dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, + rocprofiler_user_data_t* /*user_data*/, + void* /*callback_data_args*/) { static std::shared_mutex m_mutex = {}; static std::unordered_map profile_cache = {}; auto search_cache = [&]() { - if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end()) + if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end()) { *config = pos->second; return true; @@ -565,7 +562,7 @@ dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/ // Iterate through the agents and get the counters available on that agent ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters( - agent->id, + dispatch_data.agent_id, []([[maybe_unused]] rocprofiler_agent_id_t id, rocprofiler_counter_id_t* counters, size_t num_counters, @@ -600,11 +597,12 @@ dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/ // Create a colleciton profile for the counters rocprofiler_profile_config_id_t profile; - ROCPROFILER_CALL(rocprofiler_create_profile_config( - agent->id, collect_counters.data(), collect_counters.size(), &profile), - "Could not construct profile cfg"); + ROCPROFILER_CALL( + rocprofiler_create_profile_config( + dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile), + "Could not construct profile cfg"); - profile_cache.emplace(agent->id.handle, profile); + profile_cache.emplace(dispatch_data.agent_id.handle, profile); // Return the profile to collect those counters for this dispatch *config = profile; }