Shared Library Constructor (rocprofv3 deadlock fix) (#599)

* Moved tests/apps to tests/bin

* Renamed cmake project in tests/bin

* Update samples

- Use ROCPROFILER_DEFAULT_FAIL_REGEX
- tweaks to stdout messages

* Update tests

- Use ROCPROFILER_DEFAULT_FAIL_REGEX

* Add tests/lib

- libraries with HIP code

* Update PTL submodule

- remove atexit delete of thread_id_map

* Update cmake/rocprofiler_options.cmake

- Set ROCPROFILER_DEFAULT_FAIL_REGEX

* Update common lib: env + logging

- improved customization of logging settings
- default to disabling logging to files
- install failure handler for rocprofv3
- set_env support in environment.*

* Add lib/rocprofiler-sdk/shared_library.cpp

- shared library constructor

* Update lib/rocprofiler-sdk-tool/tool.cpp

- destructor thread safety
- convert callback_name_info and buffered_name_info to pointers
- install failure handler for logging

* Add tests/bin/hip-in-libraries

- hip-in-libraries is an exe which uses two shared libraries where each shared library contains HIP kernels
  - used for testing deadlocking within __hipRegisterFatBinary

* Update bin/rocprofv3

- reorganized the env variables
- use exec to launch command
- set ROCPROFILER_LIBRARY_CTOR=1

* Add tests/rocprofv3/tracing-hip-in-libraries

- uses hip-in-libraries exe for exe which uses shared libraries to launch HIP kernels

* Update bin/rocprofv3

- fix counter collection (no exec)

* Update lib/rocprofiler-sdk-tool/tool.cpp

- replace "Kernel-Name" with "Kernel_Name"

* Update lib/rocprofiler-sdk/registration.cpp

Use RTLD_LOCAL instead of RTLD_GLOBAL for env libraries

* Update tests/rocprofv3

- replace "Kernel-Name" with "Kernel_Name"

* Update tests

- vector-ops (bin) stream syncs + runs with 4 queues per device
- improve counter-collection/input1 validation
- rocprofv3/tracing-hip-in-libraries does not do sys-trace
- improved validation script for tracing-hip-in-libraries
- updated dispatch_callback in json-tool.cpp following reworking of prototypes for counter collection

* Update samples/counter_collection

- updated dispatch_callback(s) and record_callback(s) following reworking of prototypes

* Update bin/rocprofv3

- reorganized help menu
- added options for sub-HSA tables
- added --hip-runtime-trace
- changed --hip-trace to include --hip-compiler-trace

* Update lib/rocprofiler-sdk-tool

- improved kernel filtering
- removed arch_vgpr, accum_vgpr, sgpr code (in rocprofiler-sdk)
- fixed issue with counter-collection w/o tracing
- added support for fine grained HSA API tracing
- removed directly linking to HSA-runtime

* Update lib/rocprofiler-sdk/agent.cpp

- rocp_agents != hsa_agents is non-fatal when ROCPROFILER_BUILD_CI=OFF (CMake option)

* GPR (vector and scalar) info in kernel symbol data

- rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t contains general purpose register info

* Header include order fix

- Include repo headers first
- Third party library headers next
- standard library headers last

* Update dispatch profiling public API

- introduce rocprofiler_profile_counting_dispatch_data_t
- change signature of rocprofiler_profile_counting_dispatch_callback_t and rocprofiler_profile_counting_record_callback_t
- provide rocprofiler_user_data_t pointer in dispatch callback
- provide rocprofiler_user_data_t value (from dispatch cb) in record callback

* Update tests/bin/CMakeLists.txt

- fix add_subdirectory(hip-in-libraries) order

* Update VERSION

- bump to 0.2.0 in prep for AFAR

[ROCm/rocprofiler-sdk commit: 7b6d3c70bd]
This commit is contained in:
Jonathan R. Madsen
2024-03-07 22:21:26 -06:00
committed by GitHub
parent 1295c42022
commit 407fc57ede
85 changed files with 2497 additions and 856 deletions
+1 -1
View File
@@ -1 +1 @@
0.1.0
0.2.0
@@ -131,3 +131,8 @@ if(ASAN)
endif()
include(rocprofiler_memcheck)
# default FAIL_REGULAR_EXPRESSION for tests
set(ROCPROFILER_DEFAULT_FAIL_REGEX
"threw an exception|Permission denied|Could not create logging file"
CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests")
Submodule projects/rocprofiler-sdk/external/ptl updated: 12ca26ac2b...48df416254
@@ -55,4 +55,4 @@ set_tests_properties(
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -155,7 +155,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{print_lock};
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
@@ -183,7 +183,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
dim3 block(32, 32, 1); // transpose_a
print_lock.lock();
printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
printf("[transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
rank,
tid,
grid.x,
@@ -208,8 +208,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
<< " sec\n";
std::cout << "[transpose][" << rank << "][" << tid
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
@@ -57,4 +57,4 @@ set(callback-api-tracing-env
set_tests_properties(
callback-api-tracing
PROPERTIES TIMEOUT 45 LABELS "samples" ENVIRONMENT "${callback-api-tracing-env}"
FAIL_REGULAR_EXPRESSION "threw an exception")
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -177,7 +177,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{print_lock};
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
_lk.unlock();
auto _seed = std::random_device{}() * (rank + 1) * (tid + 1);
@@ -219,8 +219,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
<< " sec\n";
std::cout << "[transpose][" << rank << "][" << tid
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
@@ -55,4 +55,4 @@ set_tests_properties(
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -145,7 +145,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{print_lock};
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
@@ -173,7 +173,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
dim3 block(32, 32, 1); // transpose_a
print_lock.lock();
printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
printf("[transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
rank,
tid,
grid.x,
@@ -198,8 +198,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
<< " sec\n";
std::cout << "[transpose][" << rank << "][" << tid
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
@@ -2,6 +2,11 @@
# common utilities for samples
#
# default FAIL_REGULAR_EXPRESSION for tests
set(ROCPROFILER_DEFAULT_FAIL_REGEX
"threw an exception|Permission denied|Could not create logging file"
CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests")
# build flags
add_library(rocprofiler-samples-build-flags INTERFACE)
add_library(rocprofiler::samples-build-flags ALIAS rocprofiler-samples-build-flags)
@@ -52,7 +52,7 @@ set_tests_properties(
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
add_library(counter-collection-callback-client SHARED)
target_sources(counter-collection-callback-client PRIVATE callback_client.cpp client.hpp)
@@ -80,7 +80,7 @@ set_tests_properties(
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
add_library(counter-collection-functional-counter-client SHARED)
target_sources(counter-collection-functional-counter-client
@@ -109,4 +109,4 @@ set_tests_properties(
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -67,15 +67,14 @@ get_client_ctx()
}
void
record_callback(rocprofiler_queue_id_t,
rocprofiler_agent_id_t,
rocprofiler_correlation_id_t,
uint64_t,
void* callback_data_args,
size_t record_count,
rocprofiler_record_counter_t* record_data)
record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_record_counter_t* record_data,
size_t record_count,
rocprofiler_user_data_t user_data,
void* callback_data_args)
{
std::stringstream ss;
ss << "Kernel_id " << dispatch_data.kernel_id << ": ";
for(size_t i = 0; i < record_count; ++i)
{
ss << "(Id: " << record_data[i].id << " Value [D]: " << record_data[i].counter_value
@@ -84,6 +83,8 @@ record_callback(rocprofiler_queue_id_t,
auto* output_stream = static_cast<std::ostream*>(callback_data_args);
if(!output_stream) throw std::runtime_error{"nullptr to output stream"};
*output_stream << "[" << __FUNCTION__ << "] " << ss.str() << "\n";
(void) user_data;
}
/**
@@ -93,13 +94,10 @@ record_callback(rocprofiler_queue_id_t,
* to collect the counter SQ_WAVES for all kernel dispatch packets.
*/
void
dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t /*correlation_id*/,
const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/,
uint64_t /*kernel_id*/,
void* /*callback_data_args*/,
rocprofiler_profile_config_id_t* config)
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
rocprofiler_user_data_t* /*user_data*/,
void* /*callback_data_args*/)
{
/**
* This simple example uses the same profile counter set for all agents.
@@ -112,7 +110,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
auto search_cache = [&]() {
if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end())
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
{
*config = pos->second;
return true;
@@ -135,7 +133,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
// Iterate through the agents and get the counters available on that agent
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
agent->id,
dispatch_data.agent_id,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -169,11 +167,12 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
// Create a colleciton profile for the counters
rocprofiler_profile_config_id_t profile;
ROCPROFILER_CALL(rocprofiler_create_profile_config(
agent->id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(
rocprofiler_create_profile_config(
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
profile_cache.emplace(agent->id.handle, profile);
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
// Return the profile to collect those counters for this dispatch
*config = profile;
}
@@ -32,6 +32,7 @@
#include <unordered_map>
#include <vector>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
@@ -116,13 +117,10 @@ buffered_callback(rocprofiler_context_id_t,
* to collect the counter SQ_WAVES for all kernel dispatch packets.
*/
void
dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t /*correlation_id*/,
const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/,
uint64_t /*kernel_id*/,
void* /*callback_data_args*/,
rocprofiler_profile_config_id_t* config)
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
rocprofiler_user_data_t* /*user_data*/,
void* /*callback_data_args*/)
{
/**
* This simple example uses the same profile counter set for all agents.
@@ -135,7 +133,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
auto search_cache = [&]() {
if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end())
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
{
*config = pos->second;
return true;
@@ -158,7 +156,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
// Iterate through the agents and get the counters available on that agent
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
agent->id,
dispatch_data.agent_id,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -192,11 +190,12 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
// Create a colleciton profile for the counters
rocprofiler_profile_config_id_t profile;
ROCPROFILER_CALL(rocprofiler_create_profile_config(
agent->id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(
rocprofiler_create_profile_config(
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
profile_cache.emplace(agent->id.handle, profile);
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
// Return the profile to collect those counters for this dispatch
*config = profile;
}
@@ -189,15 +189,46 @@ buffered_callback(rocprofiler_context_id_t,
}
}
void
dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t /*correlation_id*/,
const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/,
uint64_t /*kernel_id*/,
void* /*callback_data_args*/,
rocprofiler_profile_config_id_t* config)
using agent_map_t = std::map<uint64_t, const rocprofiler_agent_v0_t*>;
agent_map_t
get_agent_info()
{
auto iterate_cb = [](rocprofiler_agent_version_t agents_ver,
const void** agents_arr,
size_t num_agents,
void* user_data) {
if(agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0)
throw std::runtime_error{"unexpected rocprofiler agent version"};
auto* agents_v = static_cast<agent_map_t*>(user_data);
for(size_t i = 0; i < num_agents; ++i)
{
const auto* itr = static_cast<const rocprofiler_agent_v0_t*>(agents_arr[i]);
agents_v->emplace(itr->id.handle, itr);
}
return ROCPROFILER_STATUS_SUCCESS;
};
auto _agents = agent_map_t{};
ROCPROFILER_CALL(
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0,
iterate_cb,
sizeof(rocprofiler_agent_t),
const_cast<void*>(static_cast<const void*>(&_agents))),
"query available agents");
return _agents;
}
void
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
rocprofiler_user_data_t* /*user_data*/,
void* /*callback_data_args*/)
{
static auto agents = get_agent_info();
auto& cap = *get_capture();
auto wlock = std::unique_lock{cap.m_mutex};
@@ -211,7 +242,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
{
std::vector<rocprofiler_counter_id_t> counters_needed;
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
agent->id,
dispatch_data.agent_id,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -237,9 +268,9 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
"Could not query counter_id");
cap.expected_counter_names.emplace(found_counter.handle, std::string(version.name));
size_t expected = 0;
ROCPROFILER_CALL(
rocprofiler_query_counter_instance_count(agent->id, found_counter, &expected),
"COULD NOT QUERY INSTANCES");
ROCPROFILER_CALL(rocprofiler_query_counter_instance_count(
dispatch_data.agent_id, found_counter, &expected),
"COULD NOT QUERY INSTANCES");
cap.remaining.push_back(found_counter);
cap.expected.emplace(found_counter.handle, expected);
@@ -266,7 +297,8 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
}
if(cap.expected.empty())
{
std::clog << "No counters found for agent - " << agent->name;
std::clog << "No counters found for agent " << dispatch_data.agent_id.handle << " ("
<< agents.at(dispatch_data.agent_id.handle)->name << ")";
}
}
if(cap.remaining.empty()) return;
@@ -274,9 +306,9 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
rocprofiler_profile_config_id_t profile;
// Select the next counter to collect.
ROCPROFILER_CALL(
rocprofiler_create_profile_config(agent->id, &(cap.remaining.back()), 1, &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(rocprofiler_create_profile_config(
dispatch_data.agent_id, &(cap.remaining.back()), 1, &profile),
"Could not construct profile cfg");
cap.remaining.pop_back();
*config = profile;
@@ -54,4 +54,4 @@ set_tests_properties(
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -145,7 +145,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{print_lock};
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
+84 -39
View File
@@ -5,9 +5,16 @@ set -eo pipefail
ROCPROFV3_DIR=$(dirname -- "$(realpath "${BASH_SOURCE[0]}")")
ROCM_DIR=$(dirname -- "${ROCPROFV3_DIR}")
: ${HSA_TOOLS_LIB:="${ROCM_DIR}/lib/librocprofiler-sdk.so"}
: ${ROCPROFILER_LIBRARY_CTOR:=1}
: ${ROCPROF_OUTPUT_PATH:="."}
: ${ROCPROF_OUTPUT_PATH_INTERNAL:="."}
: ${ROCPROF_OUTPUT_FILE_NAME:=""}
: ${ROCPROF_COUNTERS_PATH:=""}
: ${ROCPROF_PRELOAD:=""}
: ${ROCPROF_TOOL_LIBRARY:="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"}
: ${ROCPROF_SDK_LIBRARY:="${ROCM_DIR}/lib/librocprofiler-sdk.so"}
export HSA_TOOLS_LIB
export ROCPROFILER_LIBRARY_CTOR
# Define color codes
GREEN='\033[0;32m'
@@ -19,27 +26,40 @@ usage() {
if [ -z "${EC}" ]; then EC=1; fi
echo -e "${RESET}ROCProfilerV3 Run Script Usage:"
echo -e "${GREEN}-h | --help ${RESET} For showing this message"
echo -e ""
echo -e "${GREEN}--hip-trace ${RESET} For Collecting HIP Traces (runtime + compiler)"
echo -e "${GREEN}--hip-runtime-trace ${RESET} For Collecting HIP Runtime API Traces"
echo -e "${GREEN}--hip-compiler-trace ${RESET} For Collecting HIP Compiler generated code Traces"
echo -e ""
echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces"
echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces"
echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces"
echo -e ""
echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces (core + amd + image + finalizer)"
echo -e "${GREEN}--hsa-core-trace ${RESET} For Collecting HSA API Traces (core API)"
echo -e "${GREEN}--hsa-amd-trace ${RESET} For Collecting HSA API Traces (AMD-extension API)"
echo -e "${GREEN}--hsa-image-trace ${RESET} For Collecting HSA API Traces (Image-extenson API)"
echo -e "${GREEN}--hsa-finalizer-trace ${RESET} For Collecting HSA API Traces (Finalizer-extension API)"
echo -e ""
echo -e "${GREEN}--sys-trace ${RESET} For Collecting HIP,HSA, Memory Copy, (marker)ROCTx and Kernel dispatch traces\n"
echo -e ""
echo -e "${GREEN}-o | --output-file ${RESET} For the output file name"
echo -e "\t#${GREY} usage (with current dir): rocprofv3 --hsa-trace -o <file_name> <executable>"
echo -e "\t#${GREY} usage (with custom dir): rocprofv3 --hsa-trace -d <out_dir> -o <file_name> <executable>${RESET}\n"
echo -e ""
echo -e "${GREEN}-d | --output-directory ${RESET} For adding output path where the output files will be saved"
echo -e "\t#${GREY} usage (with custom dir): rocprofv3 --hsa-trace -d <out_dir> <executable>${RESET}"
echo -e ""
echo -e "${GREEN}-M | --mangled-kernels ${RESET} Do not demangle the kernel names"
echo -e "${GREEN}-T | --truncate-kernels ${RESET} Truncate the demangled kernel names"
echo -e ""
echo -e "${GREEN}-L | --list-metrics ${RESET} List metrics for counter collection"
echo -e "${GREEN}-i | --input ${RESET} For counter collection "
echo -e "\t#${GREY} Input file .txt format, automatically rerun application for every profiling features line"
echo -e "\t# Perf counters group 1"
echo -e "\tpmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts"
echo -e "\t# Perf counters group 2"
echo -e "\tpmc : WriteSize L2CacheHit ${RESET}"
echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces"
echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces"
echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces"
echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces"
echo -e "${GREEN}--hip-trace ${RESET} For Collecting HIP Runtime Traces"
echo -e "${GREEN}--hip-compiler-trace ${RESET} For Collecting HIP Compiler generated code Traces"
echo -e "${GREEN}--sys-trace ${RESET} For Collecting HIP,HSA, Memory Copy, (marker)ROCTx and Kernel dispatch traces\n"
echo -e "${GREEN}-o | --output-file ${RESET} For the output file name"
echo -e "\t#${GREY} usage e.g:(with current dir): rocprofv3 --hsa-trace -o <file_name> <executable>"
echo -e "\t#${GREY} usage e.g:(with custom dir): rocprofv3 --hsa-trace -d <out_dir> -o <file_name> <executable>${RESET}\n"
echo -e "${GREEN}-d | --output-directory ${RESET} For adding output path where the output files will be saved"
echo -e "\t#${GREY} usage e.g:(with custom dir): rocprofv3 --hsa-trace -d <out_dir> <executable>${RESET}"
echo -e "${GREEN}-M | --mangled-kernels ${RESET} Do not demangle the kernel names"
echo -e "${GREEN}-T | --truncate-kernels ${RESET} Truncate the demangled kernel names"
echo -e "${GREEN}-L | --list-metrics ${RESET} List metrics"
echo -e ""
exit ${EC}
}
@@ -48,11 +68,22 @@ if [ -z "$1" ]; then
usage 1
fi
: ${ROCPROF_OUTPUT_PATH:="."}
: ${ROCPROF_OUTPUT_PATH_INTERNAL:="."}
: ${ROCPROF_OUTPUT_FILE_NAME:=""}
: ${ROCPROF_COUNTERS_PATH:=""}
: ${ROCPROF_PRELOAD:=""}
if [ -n "${ROCPROF_PRELOAD}" ]; then
ROCPROF_PRELOAD="${ROCPROF_PRELOAD}:${ROCPROF_TOOL_LIBRARY}:${ROCPROF_SDK_LIBRARY}"
else
ROCPROF_PRELOAD="${ROCPROF_TOOL_LIBRARY}:${ROCPROF_SDK_LIBRARY}"
fi
if [ -n "${ROCP_TOOL_LIBRARIES}" ]; then
ROCP_TOOL_LIBRARIES="${ROCP_TOOL_LIBRARIES}:${ROCPROF_TOOL_LIBRARY}"
else
ROCP_TOOL_LIBRARIES="${ROCPROF_TOOL_LIBRARY}"
fi
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH}
export ROCP_TOOL_LIBRARIES
export LD_LIBRARY_PATH
while true; do
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
@@ -93,14 +124,26 @@ while true; do
shift
shift
elif [ "$1" == "--hsa-trace" ]; then
export ROCPROF_HSA_API_TRACE=1
export ROCPROF_HSA_CORE_API_TRACE=1
export ROCPROF_HSA_AMD_EXT_API_TRACE=1
export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1
export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1
shift
elif [ "$1" == "--hsa-core-trace" ]; then
export ROCPROF_HSA_CORE_API_TRACE=1
shift
elif [ "$1" == "--hsa-amd-trace" ]; then
export ROCPROF_HSA_AMD_EXT_API_TRACE=1
shift
elif [ "$1" == "--hsa-image-trace" ]; then
export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1
shift
elif [ "$1" == "--hsa-finalizer-trace" ]; then
export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1
shift
elif [[ "$1" == "-L" || "$1" == "--list-metrics" ]]; then
export ROCPROF_LIST_METRICS=1
ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \
LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \
exec ${ROCM_DIR}/lib/rocprofiler-sdk/rocprofv3-trigger-list-metrics
LD_PRELOAD="${ROCPROF_PRELOAD}" exec ${ROCM_DIR}/lib/rocprofiler-sdk/rocprofv3-trigger-list-metrics
elif [ "$1" == "--kernel-trace" ]; then
export ROCPROF_KERNEL_TRACE=1
shift
@@ -111,17 +154,25 @@ while true; do
export ROCPROF_MARKER_API_TRACE=1
shift
elif [ "$1" == "--hip-trace" ]; then
export ROCPROF_HIP_API_TRACE=1
export ROCPROF_HIP_RUNTIME_API_TRACE=1
export ROCPROF_HIP_COMPILER_API_TRACE=1
shift
elif [ "$1" == "--hip-runtime-trace" ]; then
export ROCPROF_HIP_RUNTIME_API_TRACE=1
shift
elif [ "$1" == "--hip-compiler-trace" ]; then
export ROCPROF_HIP_COMPILER_API_TRACE=1
shift
elif [ "$1" == "--sys-trace" ]; then
export ROCPROF_HSA_API_TRACE=1
export ROCPROF_HSA_CORE_API_TRACE=1
export ROCPROF_HSA_AMD_EXT_API_TRACE=1
export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1
export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1
export ROCPROF_HIP_RUNTIME_API_TRACE=1
export ROCPROF_HIP_COMPILER_API_TRACE=1
export ROCPROF_MARKER_API_TRACE=1
export ROCPROF_KERNEL_TRACE=1
export ROCPROF_MEMORY_COPY_TRACE=1
export ROCPROF_MARKER_API_TRACE=1
export ROCPROF_HIP_API_TRACE=1
shift
elif [ "$1" == "--" ]; then
shift
@@ -161,18 +212,12 @@ if [ -n "${PMC_LINES:-}" ]; then
export ROCPROF_OUTPUT_PATH=$RESULT_PATH
fi
((COUNTER++))
ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \
LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \
"${@}"
LD_PRELOAD="${ROCPROF_PRELOAD}" "${@}"
if [ -n "$ROCPROF_OUTPUT_PATH" ]; then
echo -e "\nThe output path for the following counters: $ROCPROF_OUTPUT_PATH"
fi
done
else
# for non counter collection. e.g: tracing
ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \
LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \
"${@}"
LD_PRELOAD="${ROCPROF_PRELOAD}" exec "${@}"
fi
@@ -30,6 +30,8 @@
#include <hsa/hsa_ven_amd_loader.h>
#include <stdint.h>
ROCPROFILER_EXTERN_C_INIT
/**
@@ -141,9 +143,14 @@ typedef struct
uint32_t kernarg_segment_alignment; ///< Alignment (in bytes) of the buffer used to pass
///< arguments to the kernel
uint32_t group_segment_size; ///< Size of static group segment memory required by the kernel
///< (per work-group), in bytes
///< (per work-group), in bytes. AKA: LDS size
uint32_t private_segment_size; ///< Size of static private, spill, and arg segment memory
///< required by this kernel (per work-item), in bytes.
///< required by this kernel (per work-item), in bytes. AKA:
///< scratch size
uint32_t sgpr_count; ///< Scalar general purpose register count
uint32_t arch_vgpr_count; ///< Architecture vector general purpose register count
uint32_t accum_vgpr_count; ///< Accum vector general purpose register count
} rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
/**
@@ -37,6 +37,23 @@ ROCPROFILER_EXTERN_C_INIT
* @{
*/
/**
* @brief Kernel dispatch data for profile counting callbacks
*
*/
typedef struct rocprofiler_profile_counting_dispatch_data_t
{
uint64_t size; ///< Size of this struct
rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier
rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched
rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued
rocprofiler_correlation_id_t correlation_id; ///< Correlation ID for this dispatch
uint32_t private_segment_size; /// runtime private memory segment size
uint32_t group_segment_size; /// runtime group memory segment size
rocprofiler_dim3_t workgroup_size; /// runtime workgroup size (grid * threads)
rocprofiler_dim3_t grid_size; /// runtime grid size
} rocprofiler_profile_counting_dispatch_data_t;
/**
* @brief Kernel Dispatch Callback. This is a callback that is invoked before the kernel
* is enqueued into the HSA queue. What counters to collect for a kernel are set
@@ -44,22 +61,35 @@ ROCPROFILER_EXTERN_C_INIT
* will be collected and emplaced in the buffer with @ref rocprofiler_buffer_id_t used when
* setting up this callback.
*
* @param [in] queue_id Queue the kernel dispatch packet is being enqueued onto
* @param [in] agent Agent of this queue
* @param [in] correlation_id Correlation ID for this dispatch
* @param [in] dispatch_packet Kernel dispatch packet about to be enqueued into HSA
* @param [in] kernel_id Kernel identifier
* @param [in] dispatch_data @see ::rocprofiler_profile_counting_dispatch_data_t
* @param [out] config Profile config detailing the counters to collect for this kernel
* @param [out] user_data User data unique to this dispatch. Returned in record callback
* @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service
* @param [out] config Profile config detailing the counters to collect for this kernel
*/
typedef void (*rocprofiler_profile_counting_dispatch_callback_t)(
rocprofiler_queue_id_t queue_id,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t correlation_id,
const hsa_kernel_dispatch_packet_t* dispatch_packet,
uint64_t kernel_id,
void* callback_data_args,
rocprofiler_profile_config_id_t* config);
rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
rocprofiler_user_data_t* user_data,
void* callback_data_args);
/**
* @brief Counting record callback. This is a callback is invoked when the kernel
* execution is complete and contains the counter profile data requested in
* @ref rocprofiler_profile_counting_dispatch_callback_t. Only used with
* @ref rocprofiler_configure_callback_dispatch_profile_counting_service.
*
* @param [in] dispatch_data @see ::rocprofiler_profile_counting_dispatch_data_t
* @param [in] record_data Counter record data.
* @param [in] record_count Number of counter records.
* @param [in] user_data User data instance from dispatch callback
* @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service
*/
typedef void (*rocprofiler_profile_counting_record_callback_t)(
rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_record_counter_t* record_data,
size_t record_count,
rocprofiler_user_data_t user_data,
void* callback_data_args);
/**
* @brief Configure buffered dispatch profile Counting Service.
@@ -95,29 +125,6 @@ rocprofiler_configure_buffered_dispatch_profile_counting_service(
rocprofiler_profile_counting_dispatch_callback_t callback,
void* callback_data_args);
/**
* @brief Counting record callback. This is a callback is invoked when the kernel
* execution is complete and contains the counter profile data requested in
* @ref rocprofiler_profile_counting_dispatch_callback_t. Only used with
* @ref rocprofiler_configure_callback_dispatch_profile_counting_service.
*
* @param [in] queue_id Queue the kernel dispatch packet is being enqueued onto
* @param [in] agent Agent of this queue
* @param [in] correlation_id Correlation ID for this dispatch
* @param [in] kernel_id Kernel identifier
* @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service
* @param [in] record_count Number of counter records.
* @param [in] record_data Counter record data.
*/
typedef void (*rocprofiler_profile_counting_record_callback_t)(
rocprofiler_queue_id_t queue_id,
rocprofiler_agent_id_t agent,
rocprofiler_correlation_id_t correlation_id,
uint64_t kernel_id,
void* callback_data_args,
size_t record_count,
rocprofiler_record_counter_t* record_data);
/**
* @brief Configure buffered dispatch profile Counting Service.
* Collects the counters in dispatch packets and calls a callback
@@ -261,26 +261,16 @@ typedef enum
ROCPROFILER_TABLE_LAST = ROCPROFILER_MARKER_NAME_TABLE,
} rocprofiler_intercept_table_t;
/**
* @brief Enumeration for specifying the data type contained within the union.
*/
typedef enum
{
ROCPROFILER_UNION_TYPE_NONE = 0, ///< No union type
ROCPROFILER_UNION_TYPE_STRING, ///< String Type set
ROCPROFILER_UNION_TYPE_INT, ///< Integer Type Set
ROCPROFILER_UNION_TYPE_LAST,
} rocprofiler_union_type_t;
/**
* @brief Enumeration for specifying the counter info struct version you want.
*/
typedef enum
{
ROCPROFILER_COUNTER_INFO_VERSION_NONE,
ROCPROFILER_COUNTER_INFO_VERSION_0, ///< @see rocprofiler_counter_info_v0_t
ROCPROFILER_COUNTER_INFO_VERSION_0, ///< @see ::rocprofiler_counter_info_v0_t
ROCPROFILER_COUNTER_INFO_VERSION_LAST,
} rocprofiler_counter_info_version_id_t;
//--------------------------------------------------------------------------------------//
//
// ALIASES
@@ -112,9 +112,27 @@ get_env(std::string_view env_id, Tp _default, std::enable_if_t<std::is_integral<
return _default;
}
int
set_env(std::string_view env_id, bool value, int override)
{
return ::setenv(env_id.data(), (value) ? "1" : "0", override);
}
template <typename Tp>
int
set_env(std::string_view env_id, Tp value, int override)
{
auto str_value = std::stringstream{};
str_value << value;
return ::setenv(env_id.data(), str_value.str().c_str(), override);
}
#define SPECIALIZE_GET_ENV(TYPE) \
template TYPE get_env<TYPE>( \
std::string_view, TYPE, std::enable_if_t<std::is_integral<TYPE>::value, sfinae>);
std::string_view, TYPE, std::enable_if_t<std::is_integral<TYPE>::value, sfinae>); \
template int set_env<TYPE>(std::string_view, TYPE, int);
#define SPECIALIZE_SET_ENV(TYPE) template int set_env<TYPE>(std::string_view, TYPE, int);
SPECIALIZE_GET_ENV(int8_t)
SPECIALIZE_GET_ENV(int16_t)
@@ -124,6 +142,11 @@ SPECIALIZE_GET_ENV(uint8_t)
SPECIALIZE_GET_ENV(uint16_t)
SPECIALIZE_GET_ENV(uint32_t)
SPECIALIZE_GET_ENV(uint64_t)
SPECIALIZE_SET_ENV(const char*)
SPECIALIZE_SET_ENV(std::string)
SPECIALIZE_SET_ENV(float)
SPECIALIZE_SET_ENV(double)
} // namespace impl
} // namespace common
} // namespace rocprofiler
@@ -48,6 +48,13 @@ get_env(std::string_view, bool);
template <typename Tp>
Tp get_env(std::string_view, Tp, std::enable_if_t<std::is_integral<Tp>::value, sfinae> = {});
int
set_env(std::string_view, bool, int override = 0);
template <typename Tp>
int
set_env(std::string_view, Tp, int override = 0);
} // namespace impl
template <typename Tp>
@@ -66,6 +73,13 @@ get_env(std::string_view env_id, Tp&& _default)
}
}
template <typename Tp>
inline auto
set_env(std::string_view env_id, Tp&& value, int override = 0)
{
return impl::set_env(env_id, std::forward<Tp>(value), override);
}
struct env_config
{
std::string env_name = {};
@@ -28,17 +28,28 @@
#include <fstream>
#include <mutex>
#include <string>
#include <unordered_map>
namespace rocprofiler
{
namespace common
{
namespace
{
void
init_logging(std::string_view env_var)
install_failure_signal_handler()
{
static auto _once = std::once_flag{};
std::call_once(_once, [env_var]() {
std::call_once(_once, []() { google::InstallFailureSignalHandler(); });
}
} // namespace
void
init_logging(std::string_view env_var, logging_config cfg)
{
static auto _once = std::once_flag{};
std::call_once(_once, [env_var, &cfg]() {
auto get_argv0 = []() {
auto ifs = std::ifstream{"/proc/self/cmdline"};
auto sarg = std::string{};
@@ -50,18 +61,16 @@ init_logging(std::string_view env_var)
return sarg;
};
static auto argv0 = get_argv0();
google::InitGoogleLogging(argv0.c_str());
auto loglvl = common::get_env(env_var, "error");
auto loglvl = common::get_env(env_var, "");
for(auto& itr : loglvl)
itr = tolower(itr);
// default to warning
auto loglvl_v = google::WARNING;
if(loglvl.find_first_not_of("0123456789") == std::string::npos)
auto& loglvl_v = cfg.loglevel;
if(!loglvl.empty() && loglvl.find_first_not_of("0123456789") == std::string::npos)
{
loglvl_v = std::stoul(loglvl);
}
else
else if(!loglvl.empty())
{
const auto opts =
std::unordered_map<std::string_view, uint32_t>{{"info", google::INFO},
@@ -77,10 +86,39 @@ init_logging(std::string_view env_var)
loglvl_v = opts.at(loglvl);
}
FLAGS_minloglevel = loglvl_v;
FLAGS_stderrthreshold = loglvl_v;
update_logging(cfg, true);
if(!google::IsGoogleLoggingInitialized())
{
static auto argv0 = get_argv0();
google::InitGoogleLogging(argv0.c_str());
}
update_logging(cfg);
LOG(INFO) << "logging initialized via " << env_var;
});
}
void
update_logging(const logging_config& cfg, bool setup_env, int env_override)
{
static auto _mtx = std::mutex{};
auto _lk = std::unique_lock<std::mutex>{_mtx};
FLAGS_timestamp_in_logfile_name = false;
FLAGS_minloglevel = cfg.loglevel;
FLAGS_stderrthreshold = cfg.loglevel;
FLAGS_logtostderr = cfg.logtostderr;
FLAGS_alsologtostderr = cfg.alsologtostderr;
if(cfg.install_failure_handler) install_failure_signal_handler();
if(setup_env)
{
common::set_env("GOOGLE_LOG_DIR", get_env("PWD", ""), env_override);
common::set_env("GOOGLE_LOGTOSTDERR", cfg.loglevel, env_override);
common::set_env("GOOGLE_ALSOLOGTOSTDERR", cfg.alsologtostderr, env_override);
}
}
} // namespace common
} // namespace rocprofiler
@@ -22,13 +22,27 @@
#pragma once
#include <glog/logging.h>
#include <cstdint>
#include <string_view>
namespace rocprofiler
{
namespace common
{
struct logging_config
{
bool install_failure_handler = false;
bool logtostderr = true;
bool alsologtostderr = false;
int32_t loglevel = google::WARNING;
};
void
init_logging(std::string_view env_var);
}
init_logging(std::string_view env_var, logging_config cfg = logging_config{});
void
update_logging(const logging_config& cfg, bool setup_env = false, int env_override = 0);
} // namespace common
} // namespace rocprofiler
@@ -12,7 +12,6 @@ add_subdirectory(plugins)
target_link_libraries(
rocprofiler-sdk-tool
PRIVATE rocprofiler::rocprofiler-shared-library
rocprofiler::rocprofiler-hsa-runtime
rocprofiler::rocprofiler-headers
rocprofiler::rocprofiler-build-flags
rocprofiler::rocprofiler-memcheck
@@ -57,19 +57,22 @@ struct config
{
config();
bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true);
bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false);
bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false);
bool hsa_api_trace = get_env("ROCPROF_HSA_API_TRACE", false);
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
bool hip_api_trace = get_env("ROCPROF_HIP_API_TRACE", false);
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
int mpi_size = get_mpi_size();
int mpi_rank = get_mpi_rank();
bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true);
bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false);
bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false);
bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false);
bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false);
bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false);
bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false);
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false);
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
int mpi_size = get_mpi_size();
int mpi_rank = get_mpi_rank();
std::string output_path = get_env("ROCPROF_OUTPUT_PATH", fs::current_path().string());
std::string output_file = get_env("ROCPROF_OUTPUT_FILE_NAME", std::to_string(getpid()));
std::vector<std::string> kernel_names = {};
@@ -22,7 +22,8 @@
#include "helper.hpp"
#include "config.hpp"
#include "rocprofiler-sdk/fwd.h"
#include <rocprofiler-sdk/fwd.h>
#include <glog/logging.h>
@@ -33,198 +34,6 @@
#include <unordered_set>
#include <utility>
namespace
{
using amd_compute_pgm_rsrc_three32_t = uint32_t;
// AMD Compute Program Resource Register Three.
enum amd_compute_gfx9_pgm_rsrc_three_t
{
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1)
};
enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t
{
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_SHARED_VGPR_COUNT, 0, 4),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_INST_PREF_SIZE, 4, 6),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_START, 10, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_END, 11, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_IMAGE_OP, 31, 1)
};
// Kernel code properties.
enum amd_kernel_code_property_t
{
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
0,
1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
3,
1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, 4, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
6,
1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
10,
1), // GFX10+
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4),
};
std::unordered_map<rocprofiler_address_t, const char*> kernel_descriptor_name_map;
std::mutex kernel_properties_correlation_mutex;
std::unordered_map<uint64_t, rocprofiler_tool_kernel_properties_t>
kernel_properties_correlation_map;
uint32_t
arch_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code)
{
std::string info_name(name.data(), name.size());
if(strcmp(name.data(), "gfx90a") == 0 || strncmp(name.data(), "gfx94", 5) == 0)
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3,
AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) +
1) *
4;
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
1) *
(AMD_HSA_BITS_GET(kernel_code.kernel_code_properties,
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
? 8
: 4);
}
uint32_t
accum_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code)
{
std::string info_name(name.data(), name.size());
if(strcmp(info_name.c_str(), "gfx908") == 0) return arch_vgpr_count(name, kernel_code);
if(strcmp(info_name.c_str(), "gfx90a") == 0 || strncmp(info_name.c_str(), "gfx94", 5) == 0)
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
1) *
8 -
arch_vgpr_count(name, kernel_code);
return 0;
}
uint32_t
sgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code)
{
// GFX10 and later always allocate 128 sgprs.
// TODO(srnagara): Recheck the extraction of gfxip from gpu name
const char* name_data = name.data();
const size_t gfxip_label_len = std::min(name.size() - 2, size_t{63});
if(gfxip_label_len > 0 && strnlen(name_data, gfxip_label_len + 1) >= gfxip_label_len)
{
auto gfxip = std::vector<char>{};
gfxip.resize(gfxip_label_len + 1, '\0');
memcpy(gfxip.data(), name_data, gfxip_label_len);
// TODO(srnagara): Check if it is hardcoded
if(std::stoi(&gfxip.at(3)) >= 10) return 128;
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) /
2 +
1) *
16;
}
return 0;
}
const auto&
GetLoaderTable()
{
static const auto _v = []() {
using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t;
auto _tbl = hsa_loader_table_t{};
memset(&_tbl, 0, sizeof(hsa_loader_table_t));
hsa_system_get_major_extension_table(
HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_loader_table_t), &_tbl);
return _tbl;
}();
return _v;
}
const kernel_descriptor_t*
GetKernelCode(uint64_t kernel_object)
{
const kernel_descriptor_t* kernel_code = nullptr;
if(GetLoaderTable().hsa_ven_amd_loader_query_host_address == nullptr) return kernel_code;
hsa_status_t status = GetLoaderTable().hsa_ven_amd_loader_query_host_address(
reinterpret_cast<const void*>(kernel_object), // NOLINT(performance-no-int-to-ptr)
reinterpret_cast<const void**>(&kernel_code));
if(HSA_STATUS_SUCCESS != status)
{
kernel_code = reinterpret_cast<kernel_descriptor_t*>( // NOLINT(performance-no-int-to-ptr)
kernel_object);
}
return kernel_code;
}
} // namespace
void
SetKernelProperties(uint64_t correlation_id, rocprofiler_tool_kernel_properties_t kernel_properties)
{
std::lock_guard<std::mutex> kernel_properties_correlation_map_lock(
kernel_properties_correlation_mutex);
kernel_properties_correlation_map[correlation_id] = std::move(kernel_properties);
}
rocprofiler_tool_kernel_properties_t
GetKernelProperties(uint64_t correlation_id)
{
std::lock_guard<std::mutex> kernel_properties_correlation_map_lock(
kernel_properties_correlation_mutex);
auto it = kernel_properties_correlation_map.find(correlation_id);
if(it == kernel_properties_correlation_map.end())
{
std::cout << "kernel properties not found" << std::endl;
abort();
}
return it->second;
}
void
populate_kernel_properties_data(rocprofiler_tool_kernel_properties_t* kernel_properties,
const hsa_kernel_dispatch_packet_t* dispatch_packet)
{
const uint64_t kernel_object = dispatch_packet->kernel_object;
const kernel_descriptor_t* kernel_code = GetKernelCode(kernel_object);
uint64_t grid_size =
dispatch_packet->grid_size_x * dispatch_packet->grid_size_y * dispatch_packet->grid_size_z;
if(grid_size > UINT32_MAX) abort();
kernel_properties->grid_size = grid_size;
uint64_t workgroup_size = dispatch_packet->workgroup_size_x *
dispatch_packet->workgroup_size_y * dispatch_packet->workgroup_size_z;
if(workgroup_size > UINT32_MAX) abort();
kernel_properties->workgroup_size = (uint32_t) workgroup_size;
kernel_properties->lds_size = dispatch_packet->group_segment_size;
kernel_properties->scratch_size = dispatch_packet->private_segment_size;
kernel_properties->arch_vgpr_count =
arch_vgpr_count(kernel_properties->gpu_agent.name, *kernel_code);
kernel_properties->accum_vgpr_count =
accum_vgpr_count(kernel_properties->gpu_agent.name, *kernel_code);
kernel_properties->sgpr_count = sgpr_count(kernel_properties->gpu_agent.name, *kernel_code);
kernel_properties->wave_size =
AMD_HSA_BITS_GET(kernel_code->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
? 32
: 64;
kernel_properties->signal_handle = dispatch_packet->completion_signal.handle;
}
rocprofiler_tool_buffer_name_info_t
get_buffer_id_names()
{
@@ -71,46 +71,6 @@
constexpr size_t BUFFER_SIZE_BYTES = 4096;
constexpr size_t WATERMARK = (BUFFER_SIZE_BYTES / 2);
// This can be different for different architecture
// Lets follow the v1 rocprof
// I will have a kernel id from the rocprofiler
// address the kernel descriptor and access the information
// This works for gfx9 but may not for Navi arch
// Interecept the kernel symbol load build a table for kernel id
// when kenel dispatch callback. Here is the kernel id
// Use the kernel id
typedef struct
{
uint64_t grid_size;
uint64_t workgroup_size;
uint64_t lds_size;
uint64_t scratch_size;
uint64_t arch_vgpr_count;
uint64_t accum_vgpr_count;
uint64_t sgpr_count;
uint64_t wave_size;
uint64_t signal_handle;
uint64_t kernel_object;
rocprofiler_queue_id_t queue_id;
std::string kernel_name;
rocprofiler_agent_t gpu_agent;
uint64_t thread_id;
uint64_t dispatch_index;
} rocprofiler_tool_kernel_properties_t;
struct kernel_descriptor_t
{
uint8_t reserved0[16];
int64_t kernel_code_entry_byte_offset;
uint8_t reserved1[20];
uint32_t compute_pgm_rsrc3;
uint32_t compute_pgm_rsrc1;
uint32_t compute_pgm_rsrc2;
uint16_t kernel_code_properties;
uint8_t reserved2[6];
};
using rocprofiler_tool_buffer_kind_names_t =
std::unordered_map<rocprofiler_buffer_tracing_kind_t, const char*>;
using rocprofiler_tool_buffer_kind_operation_names_t =
@@ -135,29 +95,6 @@ struct rocprofiler_tool_callback_name_info_t
rocprofiler_tool_callback_kind_operation_names_t operation_names = {};
};
// std::vector<std::string>
// GetCounterNames();
void
SetKernelDescriptorName(rocprofiler_address_t kernel_descriptor, const char* name);
void
SetKernelProperties(uint64_t correlation_id,
rocprofiler_tool_kernel_properties_t kernel_properties);
void
SetKernelProperties(uint64_t correlation_id,
rocprofiler_tool_kernel_properties_t kernel_properties);
rocprofiler_tool_kernel_properties_t
GetKernelProperties(uint64_t correlation_id);
const char*
GetKernelDescriptorName(rocprofiler_address_t kernel_descriptor);
void
populate_kernel_properties_data(rocprofiler_tool_kernel_properties_t* kernel_properties,
const hsa_kernel_dispatch_packet_t* dispatch_packet);
rocprofiler_tool_buffer_name_info_t
get_buffer_id_names();
@@ -33,6 +33,8 @@
#include "lib/common/utility.hpp"
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/external_correlation.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/internal_threading.h>
#include <rocprofiler-sdk/marker/api_id.h>
@@ -55,6 +57,21 @@
namespace common = ::rocprofiler::common;
namespace tool = ::rocprofiler::tool;
namespace std
{
template <>
struct hash<rocprofiler_agent_id_t>
{
size_t operator()(rocprofiler_agent_id_t id) const { return id.handle; }
};
} // namespace std
inline bool
operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs)
{
return (lhs.handle == rhs.handle);
}
namespace
{
constexpr uint32_t lds_block_size = 128 * 4;
@@ -68,16 +85,23 @@ get_dereference(Tp* ptr)
return *CHECK_NOTNULL(ptr);
}
template <typename Tp>
void
add_destructor(Tp*& ptr)
auto
get_destructors_lock()
{
static auto _mutex = std::mutex{};
auto _lk = std::unique_lock<std::mutex>{_mutex};
return std::unique_lock<std::mutex>{_mutex};
}
template <typename Tp>
Tp*&
add_destructor(Tp*& ptr)
{
auto _lk = get_destructors_lock();
destructors->emplace_back([&ptr]() {
delete ptr;
ptr = nullptr;
});
return ptr;
}
#define ADD_DESTRUCTOR(PTR) \
@@ -155,7 +179,7 @@ get_counter_collection_file()
"Process_Id",
"Thread_Id",
"Grid_Size",
"Kernel-Name",
"Kernel_Name",
"Workgroup_Size",
"LDS_Block_Size",
"Scratch_Size",
@@ -255,6 +279,7 @@ get_buffers()
return _v;
}
using rocprofiler_code_object_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
using rocprofiler_kernel_symbol_data_t =
rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
@@ -274,14 +299,46 @@ struct kernel_symbol_data : rocprofiler_kernel_symbol_data_t
std::string truncated_kernel_name = {};
};
template <typename Tp>
Tp*
as_pointer(Tp&& _val)
{
return new Tp{std::forward<Tp>(_val)};
}
using code_object_data_map_t = std::unordered_map<uint64_t, rocprofiler_code_object_data_t>;
using kernel_symbol_data_map_t = std::unordered_map<rocprofiler_kernel_id_t, kernel_symbol_data>;
auto kernel_data = common::Synchronized<kernel_symbol_data_map_t, true>{};
using targeted_kernels_set_t = std::unordered_set<rocprofiler_kernel_id_t>;
using counter_dimension_info_map_t =
std::unordered_map<uint64_t, std::vector<rocprofiler_record_dimension_info_t>>;
std::atomic<uint64_t> dispatch_index{0};
auto counter_dimension_data = common::Synchronized<counter_dimension_info_map_t, true>{};
auto buffered_name_info = get_buffer_id_names();
auto callback_name_info = get_callback_id_names();
auto code_obj_data = common::Synchronized<code_object_data_map_t, true>{};
auto kernel_data = common::Synchronized<kernel_symbol_data_map_t, true>{};
auto counter_dimension_data = common::Synchronized<counter_dimension_info_map_t, true>{};
auto target_kernels = common::Synchronized<targeted_kernels_set_t>{};
auto dispatch_index = std::atomic<uint64_t>{0};
auto* buffered_name_info = as_pointer(get_buffer_id_names());
auto* callback_name_info = as_pointer(get_callback_id_names());
bool
add_kernel_target(uint64_t _kern_id)
{
return target_kernels
.wlock([](targeted_kernels_set_t& _targets_v,
uint64_t _kern_id_v) { return _targets_v.emplace(_kern_id_v); },
_kern_id)
.second;
}
bool
is_targeted_kernel(uint64_t _kern_id)
{
return target_kernels.rlock(
[](const targeted_kernels_set_t& _targets_v, uint64_t _kern_id_v) {
return (_targets_v.count(_kern_id_v) > 0);
},
_kern_id);
}
auto&
get_client_ctx()
@@ -328,15 +385,16 @@ cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record,
auto ts = rocprofiler_timestamp_t{};
rocprofiler_get_timestamp(&ts);
const auto* kind_name = callback_name_info.kind_names.at(record.kind);
const auto* kind_name = CHECK_NOTNULL(callback_name_info)->kind_names.at(record.kind);
if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
{
user_data->value = ts;
}
else
{
const auto* op_name =
callback_name_info.operation_names.at(record.kind).at(record.operation);
const auto* op_name = CHECK_NOTNULL(callback_name_info)
->operation_names.at(record.kind)
.at(record.operation);
auto ss = std::stringstream{};
tool::csv::marker_csv_encoder::write_row(ss,
kind_name,
@@ -368,7 +426,7 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
auto ts = rocprofiler_timestamp_t{};
rocprofiler_get_timestamp(&ts);
const auto* kind_name = callback_name_info.kind_names.at(record.kind);
const auto* kind_name = CHECK_NOTNULL(callback_name_info)->kind_names.at(record.kind);
if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA)
{
if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT)
@@ -460,8 +518,9 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
}
else
{
const auto* op_name =
callback_name_info.operation_names.at(record.kind).at(record.operation);
const auto* op_name = CHECK_NOTNULL(callback_name_info)
->operation_names.at(record.kind)
.at(record.operation);
auto ss = std::stringstream{};
tool::csv::marker_csv_encoder::write_row(ss,
kind_name,
@@ -481,78 +540,6 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
(void) data;
}
void
counter_record_callback(rocprofiler_queue_id_t,
const rocprofiler_agent_id_t,
rocprofiler_correlation_id_t correlation_id,
uint64_t,
void*,
size_t record_count,
rocprofiler_record_counter_t* record_data)
{
rocprofiler_tool_kernel_properties_t kernel_properties =
GetKernelProperties(correlation_id.internal);
std::map<const char*, uint64_t> counter_name_value;
for(size_t count = 0; count < record_count; count++)
{
auto profiler_record = static_cast<rocprofiler_record_counter_t>(record_data[count]);
rocprofiler_counter_id_t counter_id;
rocprofiler_query_record_counter_id(profiler_record.id, &counter_id);
rocprofiler_counter_info_v0_t version;
ROCPROFILER_CALL(
rocprofiler_query_counter_info(
counter_id, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&version)),
"Could not query counter_id");
const auto& dimension_pos_ss = counter_dimension_data.rlock(
[&profiler_record](const counter_dimension_info_map_t& counter_dimension_data_v,
uint64_t handle) {
auto dimensions = counter_dimension_data_v.at(handle);
size_t pos;
auto pos_ss = std::stringstream{};
size_t num_dim = dimensions.size();
for(size_t idx = 0; idx != num_dim; idx++)
{
rocprofiler_query_record_dimension_position(
profiler_record.id, dimensions[idx].id, &pos);
pos_ss << dimensions[idx].name << ":" << pos;
if(idx != num_dim - 1) pos_ss << ",";
}
return pos_ss;
},
counter_id.handle);
auto search = counter_name_value.find(version.name);
if(search == counter_name_value.end())
counter_name_value.emplace(
std::pair<const char*, uint64_t>{version.name, profiler_record.counter_value});
else
search->second = search->second + profiler_record.counter_value;
}
for(auto itr = counter_name_value.begin(); itr != counter_name_value.end(); ++itr)
{
auto counter_collection_ss = std::stringstream{};
tool::csv::counter_collection_csv_encoder::write_row(
counter_collection_ss,
correlation_id.internal,
kernel_properties.dispatch_index,
kernel_properties.gpu_agent.id.handle,
kernel_properties.queue_id.handle,
getpid(),
kernel_properties.thread_id,
kernel_properties.grid_size,
kernel_properties.kernel_name,
kernel_properties.workgroup_size,
((kernel_properties.lds_size + (lds_block_size - 1)) & ~(lds_block_size - 1)),
kernel_properties.scratch_size,
kernel_properties.arch_vgpr_count,
kernel_properties.sgpr_count,
itr->first,
itr->second);
get_dereference(get_counter_collection_file()) << counter_collection_ss.str();
}
}
void
code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
@@ -561,7 +548,19 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD)
{
if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
{
auto* obj_data = static_cast<rocprofiler_code_object_data_t*>(record.payload);
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
{
code_obj_data.wlock(
[](code_object_data_map_t& cdata, rocprofiler_code_object_data_t* obj_data_v) {
cdata.emplace(obj_data_v->code_object_id, *obj_data_v);
},
CHECK_NOTNULL(obj_data));
}
}
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
{
flush();
}
@@ -573,11 +572,50 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
auto* sym_data = static_cast<rocprofiler_kernel_symbol_data_t*>(record.payload);
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
{
kernel_data.wlock(
auto itr = kernel_data.wlock(
[](kernel_symbol_data_map_t& kdata, rocprofiler_kernel_symbol_data_t* sym_data_v) {
kdata.emplace(sym_data_v->kernel_id, kernel_symbol_data{*sym_data_v});
return kdata.emplace(sym_data_v->kernel_id, kernel_symbol_data{*sym_data_v});
},
sym_data);
CHECK_NOTNULL(sym_data));
LOG_IF(WARNING, !itr.second)
<< "duplicate kernel symbol data for kernel_id=" << sym_data->kernel_id;
// add the kernel to the kernel_targets if
if(itr.second)
{
// if kernel name is provided by user then by default all kernels in the application
// are targeted
if(tool::get_config().kernel_names.empty())
{
add_kernel_target(sym_data->kernel_id);
}
else
{
const auto& kernel_info = itr.first->second;
for(const auto& name : tool::get_config().kernel_names)
{
if(name == kernel_info.truncated_kernel_name)
{
add_kernel_target(itr.first->first);
break;
}
else
{
auto dkernel_name = std::string_view{kernel_info.demangled_kernel_name};
auto pos = dkernel_name.find(name);
// if the demangled kernel name contains name and the next character is
// '(' then mark as found
if(pos != std::string::npos && (pos + 1) < dkernel_name.size() &&
dkernel_name.at(pos + 1) == '(')
{
add_kernel_target(itr.first->first);
break;
}
}
}
}
}
}
}
@@ -622,7 +660,7 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
auto kernel_trace_ss = std::stringstream{};
tool::csv::kernel_trace_csv_encoder::write_row(
kernel_trace_ss,
buffered_name_info.kind_names.at(record->kind),
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
record->agent_id.handle,
record->queue_id.handle,
record->kernel_id,
@@ -652,8 +690,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
auto hsa_trace_ss = std::stringstream{};
tool::csv::api_csv_encoder::write_row(
hsa_trace_ss,
buffered_name_info.kind_names.at(record->kind),
buffered_name_info.operation_names.at(record->kind).at(record->operation),
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
CHECK_NOTNULL(buffered_name_info)
->operation_names.at(record->kind)
.at(record->operation),
getpid(),
record->thread_id,
record->correlation_id.internal,
@@ -670,8 +710,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
auto memory_copy_trace_ss = std::stringstream{};
tool::csv::memory_copy_csv_encoder::write_row(
memory_copy_trace_ss,
buffered_name_info.kind_names.at(record->kind),
buffered_name_info.operation_names.at(record->kind).at(record->operation),
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
CHECK_NOTNULL(buffered_name_info)
->operation_names.at(record->kind)
.at(record->operation),
record->src_agent_id.handle,
record->dst_agent_id.handle,
record->correlation_id.internal,
@@ -689,8 +731,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
auto hip_trace_ss = std::stringstream{};
tool::csv::api_csv_encoder::write_row(
hip_trace_ss,
buffered_name_info.kind_names.at(record->kind),
buffered_name_info.operation_names.at(record->kind).at(record->operation),
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
CHECK_NOTNULL(buffered_name_info)
->operation_names.at(record->kind)
.at(record->operation),
getpid(),
record->thread_id,
record->correlation_id.internal,
@@ -710,7 +754,7 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
using counter_vec_t = std::vector<rocprofiler_counter_id_t>;
using agent_counter_map_t =
std::unordered_map<const rocprofiler_agent_t*, std::optional<rocprofiler_profile_config_id_t>>;
std::unordered_map<rocprofiler_agent_id_t, std::optional<rocprofiler_profile_config_id_t>>;
rocprofiler_status_t
dimensions_info_callback(rocprofiler_counter_id_t id,
@@ -740,14 +784,14 @@ dimensions_info_callback(rocprofiler_counter_id_t id,
// this function creates a rocprofiler profile config on the first entry
auto
get_agent_profile(const rocprofiler_agent_t* agent)
get_agent_profile(rocprofiler_agent_id_t agent_id)
{
static auto data = common::Synchronized<agent_counter_map_t>{};
auto profile = std::optional<rocprofiler_profile_config_id_t>{};
data.ulock(
[agent, &profile](const agent_counter_map_t& data_v) {
auto itr = data_v.find(agent);
[agent_id, &profile](const agent_counter_map_t& data_v) {
auto itr = data_v.find(agent_id);
if(itr != data_v.end())
{
profile = itr->second;
@@ -755,11 +799,11 @@ get_agent_profile(const rocprofiler_agent_t* agent)
}
return false;
},
[agent, &profile](agent_counter_map_t& data_v) {
[agent_id, &profile](agent_counter_map_t& data_v) {
auto counters_v = counter_vec_t{};
ROCPROFILER_CALL(
rocprofiler_iterate_agent_supported_counters(
agent->id,
agent_id,
[](rocprofiler_agent_id_t,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -771,15 +815,15 @@ get_agent_profile(const rocprofiler_agent_t* agent)
counters[i], dimensions_info_callback, nullptr),
"iterate_dimension_info");
rocprofiler_counter_info_v0_t version;
rocprofiler_counter_info_v0_t info;
ROCPROFILER_CALL(
rocprofiler_query_counter_info(counters[i],
ROCPROFILER_COUNTER_INFO_VERSION_0,
static_cast<void*>(&version)),
static_cast<void*>(&info)),
"Could not query counter_id");
if(tool::get_config().counters.count(version.name) > 0)
if(tool::get_config().counters.count(info.name) > 0)
vec->emplace_back(counters[i]);
}
return ROCPROFILER_STATUS_SUCCESS;
@@ -791,18 +835,122 @@ get_agent_profile(const rocprofiler_agent_t* agent)
{
auto profile_v = rocprofiler_profile_config_id_t{};
ROCPROFILER_CALL(rocprofiler_create_profile_config(
agent->id, counters_v.data(), counters_v.size(), &profile_v),
agent_id, counters_v.data(), counters_v.size(), &profile_v),
"Could not construct profile cfg");
profile = profile_v;
}
data_v.emplace(agent, profile);
data_v.emplace(agent_id, profile);
return true;
});
return profile;
}
struct counter_dispatch_data
{
uint64_t thread_id = 0;
uint64_t dispatch_index = 0;
};
void
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
rocprofiler_user_data_t* user_data,
void* /*callback_data_args*/)
{
auto kernel_id = dispatch_data.kernel_id;
auto agent_id = dispatch_data.agent_id;
if(!is_targeted_kernel(kernel_id))
{
return;
}
else if(auto profile = get_agent_profile(agent_id))
{
*config = *profile;
user_data->ptr = new counter_dispatch_data{.thread_id = common::get_tid(),
.dispatch_index = ++dispatch_index};
}
}
void
counter_record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_record_counter_t* record_data,
size_t record_count,
rocprofiler_user_data_t user_data,
void* /*callback_data_args*/)
{
auto kernel_id = dispatch_data.kernel_id;
const auto* cnt_dispatch_data_v = static_cast<counter_dispatch_data*>(user_data.ptr);
const auto* kernel_info = kernel_data.rlock(
[](const kernel_symbol_data_map_t& kdata, uint64_t kid) -> const auto* {
return &kdata.at(kid);
},
kernel_id);
LOG_IF(FATAL, !kernel_info) << "missing kernel information for kernel_id=" << kernel_id;
LOG_IF(ERROR, record_count == 0) << "zero record count for kernel_id=" << kernel_id
<< " (name=" << kernel_info->kernel_name << ")";
auto counter_name_value = std::map<const char*, uint64_t>{};
for(size_t count = 0; count < record_count; count++)
{
auto profiler_record = static_cast<rocprofiler_record_counter_t>(record_data[count]);
auto counter_id = rocprofiler_counter_id_t{};
auto info = rocprofiler_counter_info_v0_t{};
ROCPROFILER_CALL(rocprofiler_query_record_counter_id(profiler_record.id, &counter_id),
"query record counter id");
ROCPROFILER_CALL(
rocprofiler_query_counter_info(
counter_id, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&info)),
"query counter info");
auto search = counter_name_value.find(info.name);
if(search == counter_name_value.end())
counter_name_value.emplace(
std::pair<const char*, uint64_t>{info.name, profiler_record.counter_value});
else
search->second = search->second + profiler_record.counter_value;
}
auto lds_block_size_v =
(kernel_info->group_segment_size + (lds_block_size - 1)) & ~(lds_block_size - 1);
const auto& correlation_id = dispatch_data.correlation_id;
auto magnitude = [](rocprofiler_dim3_t dims) { return (dims.x * dims.y * dims.z); };
for(auto& itr : counter_name_value)
{
using csv_encoder = tool::csv::counter_collection_csv_encoder;
auto counter_collection_ss = std::stringstream{};
csv_encoder::write_row(counter_collection_ss,
correlation_id.internal,
cnt_dispatch_data_v->dispatch_index,
dispatch_data.agent_id.handle,
dispatch_data.queue_id.handle,
getpid(),
cnt_dispatch_data_v->thread_id,
magnitude(dispatch_data.grid_size),
kernel_info->formatted_kernel_name,
magnitude(dispatch_data.workgroup_size),
lds_block_size_v,
kernel_info->private_segment_size,
kernel_info->arch_vgpr_count,
kernel_info->sgpr_count,
itr.first,
itr.second);
get_dereference(get_counter_collection_file()) << counter_collection_ss.str();
}
delete cnt_dispatch_data_v;
}
rocprofiler_status_t
list_metrics_iterate_agents(rocprofiler_agent_version_t,
const void** agents,
@@ -912,61 +1060,6 @@ list_metrics_iterate_agents(rocprofiler_agent_version_t,
return ROCPROFILER_STATUS_SUCCESS;
}
void
dispatch_callback(rocprofiler_queue_id_t queue_id,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t correlation_id,
const hsa_kernel_dispatch_packet_t* dispatch_packet,
uint64_t kernel_id,
void* /*callback_data_args*/,
rocprofiler_profile_config_id_t* config)
{
rocprofiler_tool_kernel_properties_t kernel_properties;
const auto& kernel_info =
kernel_data.rlock([](const kernel_symbol_data_map_t& kdata,
uint64_t kernel_id_v) { return kdata.at(kernel_id_v); },
kernel_id);
auto is_targeted_kernel = [&kernel_info]() {
// if kernel name is provided by user then by default all kernels in the application are
// targeted
if(tool::get_config().kernel_names.empty()) return true;
for(const auto& name : tool::get_config().kernel_names)
{
if(name == kernel_info.truncated_kernel_name)
return true;
else
{
auto dkernel_name = std::string_view{kernel_info.demangled_kernel_name};
auto pos = dkernel_name.find(name);
// if the demangled kernel name contains name and the next character is '(' then
// mark as found
if(pos != std::string::npos && (pos + 1) < dkernel_name.size() &&
dkernel_name.at(pos + 1) == '(')
return true;
}
}
return false;
};
if(!is_targeted_kernel()) return;
auto profile = get_agent_profile(agent);
if(profile)
{
kernel_properties.kernel_name = kernel_info.formatted_kernel_name;
kernel_properties.dispatch_index = ++dispatch_index;
kernel_properties.queue_id = queue_id;
kernel_properties.gpu_agent = *agent;
kernel_properties.thread_id = common::get_tid();
populate_kernel_properties_data(&kernel_properties, dispatch_packet);
SetKernelProperties(correlation_id.internal, kernel_properties);
*config = *profile;
}
}
rocprofiler_client_finalize_t client_finalizer = nullptr;
rocprofiler_client_id_t* client_identifier = nullptr;
@@ -1059,7 +1152,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
"buffer tracing service for memory copy configure");
}
if(tool::get_config().hsa_api_trace)
if(tool::get_config().hsa_core_api_trace || tool::get_config().hsa_amd_ext_api_trace ||
tool::get_config().hsa_image_ext_api_trace || tool::get_config().hsa_finalizer_ext_api_trace)
{
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
buffer_size,
@@ -1070,18 +1164,27 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
&get_buffers().hsa_api_trace),
"buffer creation");
for(auto itr : {ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API})
using optpair_t = std::pair<bool, rocprofiler_buffer_tracing_kind_t>;
for(auto itr : {optpair_t{tool::get_config().hsa_core_api_trace,
ROCPROFILER_BUFFER_TRACING_HSA_CORE_API},
optpair_t{tool::get_config().hsa_core_api_trace,
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API},
optpair_t{tool::get_config().hsa_core_api_trace,
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API},
optpair_t{tool::get_config().hsa_core_api_trace,
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API}})
{
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
get_client_ctx(), itr, nullptr, 0, get_buffers().hsa_api_trace),
"buffer tracing service for hsa api configure");
if(itr.first)
{
ROCPROFILER_CALL(
rocprofiler_configure_buffer_tracing_service(
get_client_ctx(), itr.second, nullptr, 0, get_buffers().hsa_api_trace),
"buffer tracing service for hsa api configure");
}
}
}
if(tool::get_config().hip_api_trace || tool::get_config().hip_compiler_api_trace)
if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace)
{
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
buffer_size,
@@ -1092,7 +1195,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
&get_buffers().hip_api_trace),
"buffer creation");
if(tool::get_config().hip_api_trace)
if(tool::get_config().hip_runtime_api_trace)
{
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
get_client_ctx(),
@@ -1187,7 +1290,8 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
common::init_logging("ROCPROF_LOG_LEVEL");
auto logging_cfg = rocprofiler::common::logging_config{.install_failure_handler = true};
common::init_logging("ROCPROF_LOG_LEVEL", logging_cfg);
FLAGS_colorlogtostderr = true;
// set the client name
@@ -1205,6 +1309,10 @@ rocprofiler_configure(uint32_t version,
uint32_t minor = (version % 10000) / 100;
uint32_t patch = version % 100;
// ensure these pointers are not leaked
add_destructor(buffered_name_info);
add_destructor(callback_name_info);
if(tool::get_config().list_metrics)
{
ROCPROFILER_CALL(rocprofiler_at_intercept_table_registration(
@@ -67,8 +67,9 @@ set_target_properties(rocprofiler-object-library PROPERTIES POSITION_INDEPENDENT
add_library(rocprofiler-shared-library SHARED)
add_library(rocprofiler::rocprofiler-shared-library ALIAS rocprofiler-shared-library)
target_sources(rocprofiler-shared-library
PRIVATE $<TARGET_OBJECTS:rocprofiler::rocprofiler-object-library>)
target_sources(
rocprofiler-shared-library
PRIVATE $<TARGET_OBJECTS:rocprofiler::rocprofiler-object-library> shared_library.cpp)
target_link_libraries(
rocprofiler-shared-library
INTERFACE rocprofiler::rocprofiler-headers
@@ -48,6 +48,14 @@
namespace fs = rocprofiler::common::filesystem;
#if defined(ROCPROFILER_CI)
# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(FATAL, __VA_ARGS__)
# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(FATAL)
#else
# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(NON_CI_LEVEL, __VA_ARGS__)
# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL)
#endif
namespace rocprofiler
{
namespace agent
@@ -697,7 +705,7 @@ construct_agent_cache(::HsaApiTable* table)
},
&hsa_agents);
LOG_IF(FATAL, rocp_agents.size() != hsa_agents.size())
ROCP_CI_LOG_IF(ERROR, rocp_agents.size() != hsa_agents.size())
<< "Found " << rocp_agents.size() << " rocprofiler agents and " << hsa_agents.size()
<< " HSA agents";
@@ -24,6 +24,7 @@
#include "lib/common/container/small_vector.hpp"
#include "lib/common/synchronized.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/agent.hpp"
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
@@ -32,6 +33,7 @@
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/rocprofiler.h>
namespace rocprofiler
@@ -255,6 +257,7 @@ queue_cb(const std::shared_ptr<counter_callback_info>& info,
const hsa::Queue& queue,
const hsa::rocprofiler_packet& pkt,
uint64_t kernel_id,
rocprofiler_user_data_t* user_data,
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
const context::correlation_id* correlation_id)
{
@@ -272,14 +275,25 @@ queue_cb(const std::shared_ptr<counter_callback_info>& info,
}
}
rocprofiler_profile_config_id_t req_profile = {.handle = 0};
info->user_cb(queue.get_id(),
queue.get_agent().get_rocp_agent(),
_corr_id_v,
&pkt.kernel_dispatch,
kernel_id,
info->callback_args,
&req_profile);
auto req_profile = rocprofiler_profile_config_id_t{.handle = 0};
auto dispatch_data =
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{});
dispatch_data.kernel_id = kernel_id;
dispatch_data.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id;
dispatch_data.queue_id = queue.get_id();
dispatch_data.correlation_id = _corr_id_v;
dispatch_data.private_segment_size = pkt.kernel_dispatch.private_segment_size;
dispatch_data.group_segment_size = pkt.kernel_dispatch.group_segment_size;
dispatch_data.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x,
pkt.kernel_dispatch.workgroup_size_y,
pkt.kernel_dispatch.workgroup_size_z};
dispatch_data.grid_size = {pkt.kernel_dispatch.grid_size_x,
pkt.kernel_dispatch.grid_size_y,
pkt.kernel_dispatch.grid_size_z};
info->user_cb(dispatch_data, &req_profile, user_data, info->callback_args);
if(req_profile.handle == 0) return nullptr;
auto prof_config = get_controller().get_profile_cfg(req_profile);
@@ -407,13 +421,27 @@ completed_cb(const std::shared_ptr<counter_callback_info>& info,
if(!out.empty())
{
CHECK(info->record_callback);
info->record_callback(queue.get_id(),
queue.get_agent().get_rocp_agent()->id,
_corr_id_v,
session.kernel_id,
info->record_callback_args,
out.size(),
out.data());
auto dispatch_data =
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{});
const auto& kernel_dispatch_pkt = session.kernel_pkt.kernel_dispatch;
dispatch_data.kernel_id = session.kernel_id;
dispatch_data.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id;
dispatch_data.queue_id = queue.get_id();
dispatch_data.correlation_id = _corr_id_v;
dispatch_data.private_segment_size = kernel_dispatch_pkt.private_segment_size;
dispatch_data.group_segment_size = kernel_dispatch_pkt.group_segment_size;
dispatch_data.workgroup_size = {kernel_dispatch_pkt.workgroup_size_x,
kernel_dispatch_pkt.workgroup_size_y,
kernel_dispatch_pkt.workgroup_size_z};
dispatch_data.grid_size = {kernel_dispatch_pkt.grid_size_x,
kernel_dispatch_pkt.grid_size_y,
kernel_dispatch_pkt.grid_size_z};
info->record_callback(
dispatch_data, out.data(), out.size(), session.user_data, info->record_callback_args);
}
}
@@ -436,9 +464,11 @@ start_context(const context::context* ctx)
[=](const hsa::Queue& q,
const hsa::rocprofiler_packet& kern_pkt,
uint64_t kernel_id,
rocprofiler_user_data_t* user_data,
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
const context::correlation_id* correlation_id) {
return queue_cb(cb, q, kern_pkt, kernel_id, extern_corr_ids, correlation_id);
return queue_cb(
cb, q, kern_pkt, kernel_id, user_data, extern_corr_ids, correlation_id);
},
// Completion CB
[=](const hsa::Queue& q,
@@ -24,6 +24,7 @@
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/dispatch_profile.h>
#include <rocprofiler-sdk/fwd.h>
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
@@ -80,7 +81,6 @@ struct counter_callback_info
// HSA Queue ClientID. This is an ID we get when we insert a callback into the
// HSA queue interceptor. This ID can be used to disable the callback.
rocprofiler::hsa::ClientID queue_id{-1};
// Buffer to use for storing counter data. Used if callback is not set.
std::optional<rocprofiler_buffer_id_t> buffer;
@@ -130,6 +130,7 @@ queue_cb(const std::shared_ptr<counter_callback_info>& info,
const hsa::Queue& queue,
const hsa::rocprofiler_packet& pkt,
uint64_t kernel_id,
rocprofiler_user_data_t* user_data,
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
const context::correlation_id* correlation_id);
@@ -20,10 +20,23 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <algorithm>
#include <cstdint>
#include <sstream>
#include <tuple>
#include "lib/rocprofiler-sdk/counters/core.hpp"
#include "lib/common/static_object.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/agent.hpp"
#include "lib/rocprofiler-sdk/buffer.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/counters/id_decode.hpp"
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include <rocprofiler-sdk/dispatch_profile.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <fmt/core.h>
#include <gtest/gtest.h>
@@ -31,21 +44,10 @@
#include <hsa/hsa_api_trace.h>
#include <hsa/hsa_ext_amd.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/common/static_object.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/agent.hpp"
#include "lib/rocprofiler-sdk/buffer.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/counters/core.hpp"
#include "lib/rocprofiler-sdk/counters/id_decode.hpp"
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include "rocprofiler-sdk/registration.h"
#include <algorithm>
#include <cstdint>
#include <sstream>
#include <tuple>
using namespace rocprofiler::counters;
using namespace rocprofiler;
@@ -204,13 +206,10 @@ buffered_callback(rocprofiler_context_id_t,
}
void
null_dispatch_callback(rocprofiler_queue_id_t,
const rocprofiler_agent_t*,
rocprofiler_correlation_id_t,
const hsa_kernel_dispatch_packet_t*,
uint64_t,
void*,
rocprofiler_profile_config_id_t*)
null_dispatch_callback(rocprofiler_profile_counting_dispatch_data_t,
rocprofiler_profile_config_id_t*,
rocprofiler_user_data_t*,
void*)
{}
void
@@ -223,13 +222,11 @@ null_buffered_callback(rocprofiler_context_id_t,
{}
void
null_record_callback(rocprofiler_queue_id_t,
rocprofiler_agent_id_t,
rocprofiler_correlation_id_t,
uint64_t,
void*,
null_record_callback(rocprofiler_profile_counting_dispatch_data_t,
rocprofiler_record_counter_t*,
size_t,
rocprofiler_record_counter_t*)
rocprofiler_user_data_t,
void*)
{}
} // namespace
@@ -326,10 +323,10 @@ public:
, _agent(a)
, _id(id)
{}
virtual const AgentCache& get_agent() const override final { return _agent; };
virtual rocprofiler_queue_id_t get_id() const override final { return _id; };
const AgentCache& get_agent() const final { return _agent; };
rocprofiler_queue_id_t get_id() const final { return _id; };
~FakeQueue() {}
~FakeQueue() override = default;
private:
const AgentCache& _agent;
@@ -339,39 +336,60 @@ private:
} // namespace hsa
} // namespace rocprofiler
bool
operator==(rocprofiler_dim3_t lhs, rocprofiler_dim3_t rhs)
{
return std::tie(lhs.x, lhs.y, lhs.z) == std::tie(rhs.x, rhs.y, rhs.z);
}
bool
operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs)
{
return (lhs.handle == rhs.handle);
}
namespace
{
struct expected_dispatch
{
// To pass back
rocprofiler_profile_config_id_t id;
rocprofiler_queue_id_t queue_id;
const rocprofiler_agent_t* agent;
rocprofiler_correlation_id_t correlation_id;
hsa_kernel_dispatch_packet_t* dispatch_packet;
uint64_t kernel_id;
rocprofiler_profile_config_id_t* config;
rocprofiler_profile_config_id_t id = {};
rocprofiler_queue_id_t queue_id = {.handle = 0};
rocprofiler_agent_id_t agent_id = {.handle = 0};
uint64_t kernel_id = 0;
rocprofiler_correlation_id_t correlation_id = {.internal = 0, .external = {.value = 0}};
rocprofiler_dim3_t workgroup_size = {0, 0, 0};
rocprofiler_dim3_t grid_size = {0, 0, 0};
rocprofiler_profile_config_id_t* config = nullptr;
};
void
user_dispatch_cb(rocprofiler_queue_id_t queue_id,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t correlation_id,
const hsa_kernel_dispatch_packet_t* dispatch_packet,
uint64_t kernel_id,
void* callback_data_args,
rocprofiler_profile_config_id_t* config)
user_dispatch_cb(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
rocprofiler_user_data_t* user_data,
void* callback_data_args)
{
expected_dispatch& expected = *static_cast<expected_dispatch*>(callback_data_args);
ASSERT_EQ(expected.agent, agent);
ASSERT_EQ(expected.queue_id.handle, queue_id.handle);
ASSERT_EQ(expected.correlation_id.internal, correlation_id.internal);
ASSERT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr);
ASSERT_EQ(expected.correlation_id.external.value, correlation_id.external.value);
ASSERT_EQ(expected.dispatch_packet, dispatch_packet);
ASSERT_EQ(expected.kernel_id, kernel_id);
auto agent_id = dispatch_data.agent_id;
auto queue_id = dispatch_data.queue_id;
auto correlation_id = dispatch_data.correlation_id;
auto kernel_id = dispatch_data.kernel_id;
EXPECT_EQ(sizeof(rocprofiler_profile_counting_dispatch_data_t), dispatch_data.size);
EXPECT_EQ(expected.kernel_id, kernel_id);
EXPECT_EQ(expected.agent_id, agent_id);
EXPECT_EQ(expected.queue_id.handle, queue_id.handle);
EXPECT_EQ(expected.correlation_id.internal, correlation_id.internal);
EXPECT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr);
EXPECT_EQ(expected.correlation_id.external.value, correlation_id.external.value);
EXPECT_EQ(expected.workgroup_size, dispatch_data.workgroup_size);
EXPECT_EQ(expected.grid_size, dispatch_data.grid_size);
ASSERT_NE(config, nullptr);
config->handle = expected.id.handle;
(void) user_data;
}
} // namespace
@@ -440,17 +458,22 @@ TEST(core, check_callbacks)
hsa::rocprofiler_packet pkt;
pkt.ext_amd_aql_pm4.header = count++;
expected.correlation_id = {.internal = corr_id.internal,
expected.correlation_id = {.internal = corr_id.internal,
.external = context::null_user_data};
expected.dispatch_packet = &pkt.kernel_dispatch;
expected.kernel_id = count++;
expected.queue_id = qid;
expected.agent = fq.get_agent().get_rocp_agent();
expected.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x,
pkt.kernel_dispatch.workgroup_size_y,
pkt.kernel_dispatch.workgroup_size_z};
expected.grid_size = {pkt.kernel_dispatch.grid_size_x,
pkt.kernel_dispatch.grid_size_y,
pkt.kernel_dispatch.grid_size_z};
expected.kernel_id = count++;
expected.queue_id = qid;
expected.agent_id = fq.get_agent().get_rocp_agent()->id;
hsa::Queue::queue_info_session_t::external_corr_id_map_t extern_ids = {};
auto ret_pkt =
counters::queue_cb(cb_info, fq, pkt, expected.kernel_id, extern_ids, &corr_id);
auto user_data = rocprofiler_user_data_t{.value = corr_id.internal};
auto ret_pkt = counters::queue_cb(
cb_info, fq, pkt, expected.kernel_id, &user_data, extern_ids, &corr_id);
ASSERT_TRUE(ret_pkt) << fmt::format("Expected a packet to be generated for - {}",
metric.name());
@@ -20,14 +20,6 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <gtest/gtest.h>
#include <fmt/core.h>
#include <hsa/hsa.h>
#include <hsa/hsa_api_trace.h>
#include <hsa/hsa_ext_amd.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/common/static_object.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/agent.hpp"
@@ -42,7 +34,15 @@
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include "rocprofiler-sdk/registration.h"
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <fmt/core.h>
#include <gtest/gtest.h>
#include <hsa/hsa.h>
#include <hsa/hsa_api_trace.h>
#include <hsa/hsa_ext_amd.h>
namespace
{
@@ -20,16 +20,6 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <algorithm>
#include <cstdint>
#include <sstream>
#include <tuple>
#include <fmt/core.h>
#include <gtest/gtest.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/common/static_object.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/buffer.hpp"
@@ -37,7 +27,18 @@
#include "lib/rocprofiler-sdk/counters/id_decode.hpp"
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include "rocprofiler-sdk/registration.h"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <fmt/core.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <cstdint>
#include <sstream>
#include <tuple>
using namespace rocprofiler::counters;
@@ -125,13 +126,10 @@ buffered_callback(rocprofiler_context_id_t,
{}
void
dispatch_callback(rocprofiler_queue_id_t,
const rocprofiler_agent_t*,
rocprofiler_correlation_id_t,
const hsa_kernel_dispatch_packet_t*,
uint64_t,
void*,
rocprofiler_profile_config_id_t*)
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t,
rocprofiler_profile_config_id_t*,
rocprofiler_user_data_t*,
void*)
{}
rocprofiler_context_id_t&
@@ -166,12 +166,131 @@ get_names()
namespace
{
using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t;
using context_t = context::context;
using user_data_t = rocprofiler_user_data_t;
using context_array_t = context::context_array_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using name_array_t = std::vector<std::pair<size_t, std::unique_ptr<std::string>>>;
using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t;
using context_t = context::context;
using user_data_t = rocprofiler_user_data_t;
using context_array_t = context::context_array_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using name_array_t = std::vector<std::pair<size_t, std::unique_ptr<std::string>>>;
using amd_compute_pgm_rsrc_three32_t = uint32_t;
struct kernel_descriptor_t
{
uint8_t reserved0[16];
int64_t kernel_code_entry_byte_offset;
uint8_t reserved1[20];
uint32_t compute_pgm_rsrc3;
uint32_t compute_pgm_rsrc1;
uint32_t compute_pgm_rsrc2;
uint16_t kernel_code_properties;
uint8_t reserved2[6];
};
// AMD Compute Program Resource Register Three.
enum amd_compute_gfx9_pgm_rsrc_three_t
{
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1)
};
enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t
{
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_SHARED_VGPR_COUNT, 0, 4),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_INST_PREF_SIZE, 4, 6),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_START, 10, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_END, 11, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_IMAGE_OP, 31, 1)
};
// Kernel code properties.
enum amd_kernel_code_property_t
{
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
0,
1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
3,
1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, 4, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
6,
1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
10,
1), // GFX10+
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1),
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4),
};
uint32_t
arch_vgpr_count(std::string_view name, kernel_descriptor_t kernel_code)
{
if(name == "gfx90a" || name.find("gfx94") == 0)
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3,
AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) +
1) *
4;
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
1) *
(AMD_HSA_BITS_GET(kernel_code.kernel_code_properties,
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
? 8
: 4);
}
uint32_t
accum_vgpr_count(std::string_view name, kernel_descriptor_t kernel_code)
{
if(name == "gfx908")
return arch_vgpr_count(name, kernel_code);
else if(name == "gfx90a" || name.find("gfx94") == 0)
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
1) *
(8 - arch_vgpr_count(name, kernel_code));
LOG(WARNING) << "Missing support for accum_vgpr_count for " << name;
return 0;
}
uint32_t
sgpr_count(std::string_view name, kernel_descriptor_t kernel_code)
{
// GFX10 and later always allocate 128 sgprs.
constexpr uint32_t gfx10_sgprs = 128;
auto begp = name.find_first_of("0123456789");
if(!name.empty() && begp != std::string_view::npos)
{
auto endp = name.find_first_not_of("0123456789", begp);
auto lenp = (endp - begp) + 1;
auto gfxip_str = name.substr(begp, lenp);
auto gfxip_n = int32_t{0};
if(!gfxip_str.empty()) gfxip_n = std::stoi(std::string{gfxip_str});
if(gfxip_n >= 1000)
{
return gfx10_sgprs;
}
else
{
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) /
2 +
1) *
16;
}
}
LOG(WARNING) << "Missing support for sgpr_count for " << name;
return 0;
}
name_array_t*
get_string_array()
@@ -212,6 +331,41 @@ get_loader_table()
return _v;
}
auto*&
get_status_string_function()
{
static decltype(::hsa_status_string)* _v = nullptr;
return _v;
}
std::string_view
get_status_string(hsa_status_t _status)
{
const char* _msg = nullptr;
if(get_status_string_function() &&
get_status_string_function()(_status, &_msg) == HSA_STATUS_SUCCESS && _msg)
return std::string_view{_msg};
return std::string_view{"(unknown HSA error)"};
}
const kernel_descriptor_t*
get_kernel_descriptor(uint64_t kernel_object)
{
const kernel_descriptor_t* kernel_code = nullptr;
if(get_loader_table().hsa_ven_amd_loader_query_host_address == nullptr) return kernel_code;
hsa_status_t status = get_loader_table().hsa_ven_amd_loader_query_host_address(
reinterpret_cast<const void*>(kernel_object), // NOLINT(performance-no-int-to-ptr)
reinterpret_cast<const void**>(&kernel_code));
if(status == HSA_STATUS_SUCCESS) return kernel_code;
LOG(WARNING) << "hsa_ven_amd_loader_query_host_address(kernel_object=" << kernel_object
<< ") returned " << status << ": " << get_status_string(status);
// NOLINTNEXTLINE(performance-no-int-to-ptr)
return reinterpret_cast<kernel_descriptor_t*>(kernel_object);
}
struct kernel_symbol
{
using kernel_symbol_data_t =
@@ -441,6 +595,19 @@ executable_iterate_agent_symbols_load_callback(hsa_executable_t executabl
ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&data.private_segment_size);
// This works for gfx9 but may not for Navi arch
const auto* kernel_descript = get_kernel_descriptor(data.kernel_object);
if(CHECK_NOTNULL(code_obj_v) && CHECK_NOTNULL(kernel_descript))
{
const auto* rocp_agent = agent::get_agent(code_obj_v->rocp_data.rocp_agent);
if(CHECK_NOTNULL(rocp_agent))
{
data.arch_vgpr_count = arch_vgpr_count(rocp_agent->name, *kernel_descript);
data.accum_vgpr_count = accum_vgpr_count(rocp_agent->name, *kernel_descript);
data.sgpr_count = sgpr_count(rocp_agent->name, *kernel_descript);
}
}
// if we have reached this point (i.e. there were no HSA errors returned within macro) then we
// generate a unique kernel symbol id
data.kernel_id = ++get_kernel_symbol_id();
@@ -905,10 +1072,13 @@ code_object_init(HsaApiTable* table)
{
auto& core_table = *table->core_;
get_status_string_function() = core_table.hsa_status_string_fn;
auto _status = core_table.hsa_system_get_major_extension_table_fn(
HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_loader_table_t), &get_loader_table());
LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS) << "hsa_system_get_major_extension_table failed";
LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS)
<< "hsa_system_get_major_extension_table failed: " << get_status_string(_status);
if(_status == HSA_STATUS_SUCCESS)
{
@@ -253,8 +253,9 @@ WriteInterceptor(const void* packets,
return;
}
auto thr_id = common::get_tid();
auto* corr_id = context::get_latest_correlation_id();
auto thr_id = common::get_tid();
auto* corr_id = context::get_latest_correlation_id();
auto user_data = rocprofiler_user_data_t{.value = 0};
// use thread-local value to reuse allocation
auto extern_corr_ids = Queue::queue_info_session_t::external_corr_id_map_t{};
@@ -300,8 +301,8 @@ WriteInterceptor(const void* packets,
queue.signal_callback([&](const auto& map) {
for(const auto& [client_id, cb_pair] : map)
{
if(auto maybe_pkt =
cb_pair.first(queue, kernel_pkt, kernel_id, extern_corr_ids, corr_id))
if(auto maybe_pkt = cb_pair.first(
queue, kernel_pkt, kernel_id, &user_data, extern_corr_ids, corr_id))
{
inst_pkt.push_back(std::make_pair(std::move(maybe_pkt), client_id));
}
@@ -381,6 +382,7 @@ WriteInterceptor(const void* packets,
.tid = thr_id,
.kernel_id = kernel_id,
.queue_id = queue.get_id(),
.user_data = user_data,
.hsa_agent = queue.get_agent().get_hsa_agent(),
.rocp_agent = queue.get_agent().get_rocp_agent(),
.correlation_id = corr_id,
@@ -122,6 +122,7 @@ public:
rocprofiler_thread_id_t tid = common::get_tid();
rocprofiler_kernel_id_t kernel_id = 0;
rocprofiler_queue_id_t queue_id = {};
rocprofiler_user_data_t user_data = {.value = 0};
hsa_agent_t hsa_agent = {};
const rocprofiler_agent_t* rocp_agent = nullptr;
context::correlation_id* correlation_id = nullptr;
@@ -137,6 +138,7 @@ public:
const Queue&,
const rocprofiler_packet&,
uint64_t,
rocprofiler_user_data_t*,
const queue_info_session_t::external_corr_id_map_t&,
const context::correlation_id*)>;
// Signals the completion of the kernel packet.
@@ -220,4 +222,4 @@ Queue::signal_callback(FuncT&& func) const
}
} // namespace hsa
} // namespace rocprofiler
} // namespace rocprofiler
@@ -20,7 +20,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <glog/logging.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/common/synchronized.hpp"
@@ -31,7 +31,8 @@
#include "lib/rocprofiler-sdk/counters/evaluate_ast.hpp"
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
#include "rocprofiler-sdk/fwd.h"
#include <glog/logging.h>
extern "C" {
/**
@@ -235,14 +235,15 @@ find_clients()
{
for(const auto& itr : env)
{
LOG(INFO) << "searching " << itr << " for rocprofiler_configure";
LOG(INFO) << "[env] searching " << itr << " for rocprofiler_configure";
void* handle = dlopen(itr.c_str(), RTLD_NOLOAD | RTLD_LAZY);
if(!handle)
{
LOG(INFO) << itr << " is not already loaded, doing a global lazy dlopen...";
handle = dlopen(itr.c_str(), RTLD_GLOBAL | RTLD_LAZY);
LOG(WARNING) << "[env] " << itr
<< " is not already loaded, doing a local lazy dlopen...";
handle = dlopen(itr.c_str(), RTLD_LOCAL | RTLD_LAZY);
}
if(!handle)
@@ -0,0 +1,84 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "lib/common/environment.hpp"
#include "lib/common/logging.hpp"
#include "lib/common/static_object.hpp"
#include "lib/rocprofiler-sdk/allocator.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include <iostream>
namespace rocprofiler
{
namespace shared_library
{
namespace
{
struct lifetime
{
lifetime();
~lifetime();
};
lifetime::lifetime()
{
registration::init_logging();
if(common::get_env("ROCPROFILER_LIBRARY_CTOR", false))
{
LOG(INFO) << "Initializing rocprofiler-sdk library...";
registration::initialize();
LOG(INFO) << "rocprofiler-sdk library initialized";
}
}
lifetime::~lifetime()
{
if(common::get_env("ROCPROFILER_LIBRARY_DTOR", false))
{
LOG(INFO) << "Finalizing rocprofiler-sdk library...";
registration::finalize();
LOG(INFO) << "rocprofiler-sdk library finalized";
}
}
auto*&
get_lifetime()
{
static auto* _v = common::static_object<lifetime>::construct();
return _v;
}
} // namespace
} // namespace shared_library
auto rocprofiler_sdk_shlib_lifetime = shared_library::get_lifetime();
void
rocprofiler_sdk_shlib_ctor() ROCPROFILER_ATTRIBUTE(constructor(101));
void
rocprofiler_sdk_shlib_ctor()
{
(void) shared_library::get_lifetime();
}
} // namespace rocprofiler
@@ -22,11 +22,11 @@
#pragma once
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/common/defines.hpp"
#include "rocprofiler-sdk/fwd.h"
#include <gtest/gtest.h>
@@ -20,10 +20,14 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <hsa/hsa.h>
#include <rocprofiler-sdk-roctx/api_trace.h>
#include <rocprofiler-sdk-roctx/roctx.h>
#include <rocprofiler-sdk-roctx/types.h>
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/context.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/marker/api_id.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
@@ -33,13 +37,9 @@
#include "lib/common/units.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/tests/common.hpp"
#include "rocprofiler-sdk-roctx/api_trace.h"
#include "rocprofiler-sdk-roctx/types.h"
#include "rocprofiler-sdk/callback_tracing.h"
#include "rocprofiler-sdk/context.h"
#include "rocprofiler-sdk/marker/api_id.h"
#include <gtest/gtest.h>
#include <hsa/hsa.h>
#include <dlfcn.h>
#include <pthread.h>
@@ -40,8 +40,11 @@ add_subdirectory(common)
# tool libraries used for data collection during integration tests
add_subdirectory(tools)
# libraries used by integration test applications
add_subdirectory(lib)
# applications used by integration tests
add_subdirectory(apps)
add_subdirectory(bin)
# validation tests
add_subdirectory(kernel-tracing)
@@ -29,7 +29,8 @@ set(async-copy-tracing-env
set_tests_properties(
test-async-copy-tracing-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
"${async-copy-tracing-env}" FAIL_REGULAR_EXPRESSION "threw an exception")
"${async-copy-tracing-env}" FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
foreach(FILENAME validate.py pytest.ini conftest.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
@@ -44,4 +45,4 @@ set_tests_properties(
test-async-copy-tracing-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
test-async-copy-tracing-execute FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -3,7 +3,7 @@
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(rocprofiler-test-apps LANGUAGES C CXX)
project(rocprofiler-tests-bin LANGUAGES C CXX)
set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib")
@@ -11,6 +11,7 @@ set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib")
add_subdirectory(simple-transpose)
add_subdirectory(multistream)
add_subdirectory(vector-operations)
add_subdirectory(hip-in-libraries)
set(CMAKE_BUILD_RPATH
"\$ORIGIN:\$ORIGIN/../lib:$<TARGET_FILE_DIR:rocprofiler-sdk-roctx::rocprofiler-sdk-roctx-shared-library>"
@@ -0,0 +1,34 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(rocprofiler-tests-bin-hip-in-libraries LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_executable(hip-in-libraries)
target_sources(hip-in-libraries PRIVATE hip-in-libraries.cpp)
target_compile_options(hip-in-libraries PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow
-Werror)
target_link_libraries(hip-in-libraries PRIVATE transpose-shared-library
vector-ops-shared-library)
find_package(hip REQUIRED)
target_link_libraries(hip-in-libraries PRIVATE hip::host)
find_package(Threads REQUIRED)
target_link_libraries(hip-in-libraries PRIVATE Threads::Threads)
if(TRANSPOSE_USE_MPI)
find_package(MPI REQUIRED)
target_compile_definitions(hip-in-libraries PRIVATE USE_MPI)
target_link_libraries(hip-in-libraries PRIVATE MPI::MPI_C)
endif()
install(
TARGETS hip-in-libraries
DESTINATION bin
COMPONENT tests)
@@ -0,0 +1,151 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "transpose.hpp"
#include "vector-ops.hpp"
#include <hip/hip_runtime_api.h>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>
#if defined(USE_MPI)
# include <mpi.h>
#endif
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
throw std::runtime_error("hip_api_call"); \
} \
}
namespace
{
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
size_t nqueues = 8;
size_t nthreads = 4;
size_t nitr = 500;
size_t nsync = 10;
} // namespace
int
main(int argc, char** argv)
{
int rank = 0;
int size = 1;
#if defined(USE_MPI)
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
#else
(void) size;
#endif
for(int i = 1; i < argc; ++i)
{
auto _arg = std::string{argv[i]};
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
if(rank == 0)
{
fprintf(stderr,
"usage: hip-in-libraries [NUM_QUEUES (%zu)] [NUM_THREADS (%zu)] "
"[NUM_ITERATION (%zu)] "
"[SYNC_EVERY_N_ITERATIONS (%zu)]\n",
nqueues,
nthreads,
nitr,
nsync);
}
exit(EXIT_SUCCESS);
}
}
if(argc > 1) nqueues = atoll(argv[1]);
if(argc > 2) nthreads = atoll(argv[2]);
if(argc > 3) nitr = atoll(argv[3]);
if(argc > 4) nsync = atoll(argv[4]);
int ndevice = 0;
HIP_API_CALL(hipGetDeviceCount(&ndevice));
printf("[hip-in-libraries] Number of devices found: %i\n", ndevice);
printf("[hip-in-libraries] Number of queues: %zu\n", nqueues);
printf("[hip-in-libraries] Number of threads: %zu\n", nthreads);
printf("[hip-in-libraries] Number of iterations: %zu\n", nitr);
printf("[hip-in-libraries] Syncing every %zu iterations\n", nsync);
{
auto vector_ops_thread = std::thread{run_vector_ops, nthreads, nqueues};
auto transpose_thread = std::thread{run_transpose, nthreads, nitr, nsync};
vector_ops_thread.join();
transpose_thread.join();
}
// this is a temporary workaround in omnitrace when HIP + MPI is enabled
#if defined(USE_MPI)
MPI_Barrier(MPI_COMM_WORLD);
#endif
for(int i = 0; i < ndevice; ++i)
{
HIP_API_CALL(hipSetDevice(i));
HIP_API_CALL(hipDeviceSynchronize());
}
#if defined(USE_MPI)
MPI_Barrier(MPI_COMM_WORLD);
#endif
if(rank == 0)
{
for(int i = 0; i < ndevice; ++i)
{
HIP_API_CALL(hipSetDevice(i));
HIP_API_CALL(hipDeviceReset());
}
}
#if defined(USE_MPI)
MPI_Barrier(MPI_COMM_WORLD);
#endif
return 0;
}
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
endif()
endif()
project(rocprofiler-test-app-multistream LANGUAGES CXX HIP)
project(rocprofiler-tests-bin-multistream LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
endif()
endif()
project(rocprofiler-test-app-reproducible-runtime LANGUAGES CXX HIP)
project(rocprofiler-tests-bin-reproducible-runtime LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
endif()
endif()
project(rocprofiler-tool-test-app-transpose LANGUAGES CXX HIP)
project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
endif()
endif()
project(rocprofiler-test-app-transpose LANGUAGES CXX HIP)
project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
@@ -189,7 +189,7 @@ run(int rank, int tid, int devid, int argc, char** argv)
HIP_API_CALL(hipStreamCreate(&stream));
auto_lock_t _lk{print_lock};
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
@@ -230,8 +230,10 @@ run(int rank, int tid, int devid, int argc, char** argv)
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
<< " sec\n";
std::cout << "[transpose][" << rank << "][" << tid
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
endif()
endif()
project(rocprofiler-tool-test-app-transpose LANGUAGES CXX HIP)
project(rocprofiler-tests-bin-vector-operations LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
@@ -1,24 +1,25 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <assert.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
@@ -132,8 +133,12 @@ divide_kernel(float* __restrict__ a,
using namespace std;
void
run(int NUM_QUEUE)
run(int NUM_QUEUE, int DEVICE_ID)
{
HIP_API_CALL(hipSetDevice(DEVICE_ID));
HIP_API_CALL(hipDeviceSynchronize());
std::vector<float*> hostA(NUM_QUEUE);
std::vector<float*> hostB(NUM_QUEUE);
std::vector<float*> hostC(NUM_QUEUE);
@@ -144,10 +149,18 @@ run(int NUM_QUEUE)
std::vector<hipStream_t> streams(NUM_QUEUE);
hipDeviceProp_t devProp;
HIP_API_CALL(hipGetDeviceProperties(&devProp, 0));
auto sync_stream = [NUM_QUEUE, streams](int q) {
if(q < 0 || q >= NUM_QUEUE)
throw std::runtime_error{std::string{"invalid stream id: "} + std::to_string(q)};
int i;
HIP_API_CALL(hipStreamSynchronize(streams.at(q)));
};
auto sync_streams = [NUM_QUEUE, sync_stream]() {
for(int i = 0; i < NUM_QUEUE; ++i)
sync_stream(i);
HIP_API_CALL(hipDeviceSynchronize());
};
for(int q = 0; q < NUM_QUEUE; q++)
{
@@ -158,26 +171,26 @@ run(int NUM_QUEUE)
HIP_API_CALL(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0));
// initialize the input data
for(i = 0; i < NUM; i++)
for(int i = 0; i < NUM; i++)
{
hostB[q][i] = (float) i;
hostC[q][i] = (float) i * 100.0f;
hostB[q][i] = static_cast<float>(i);
hostC[q][i] = static_cast<float>(i * 100.0f);
}
HIP_API_CALL(hipMalloc((void**) (&deviceA[q]), NUM * sizeof(float)));
HIP_API_CALL(hipMalloc((void**) (&deviceB[q]), NUM * sizeof(float)));
HIP_API_CALL(hipMalloc((void**) (&deviceC[q]), NUM * sizeof(float)));
HIP_API_CALL(hipMallocAsync(&deviceA[q], NUM * sizeof(float), streams[q]));
HIP_API_CALL(hipMallocAsync(&deviceB[q], NUM * sizeof(float), streams[q]));
HIP_API_CALL(hipMallocAsync(&deviceC[q], NUM * sizeof(float), streams[q]));
HIP_API_CALL(hipMemcpyAsync(
deviceB[q], hostB[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
HIP_API_CALL(hipMemcpyAsync(
deviceC[q], hostC[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
}
HIP_API_CALL(hipDeviceSynchronize());
for(int RUN_I = 0; RUN_I < 2; RUN_I++)
sync_streams();
for(int q = 0; q < NUM_QUEUE; q++)
{
int q = (4 * RUN_I + 0) % NUM_QUEUE;
hipLaunchKernelGGL(addition_kernel,
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
@@ -189,8 +202,8 @@ run(int NUM_QUEUE)
WIDTH,
HEIGHT);
HIP_API_CALL(hipDeviceSynchronize());
q = (4 * RUN_I + 1) % NUM_QUEUE;
HIP_API_CALL(hipGetLastError());
hipLaunchKernelGGL(subtract_kernel,
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
@@ -202,8 +215,8 @@ run(int NUM_QUEUE)
WIDTH,
HEIGHT);
HIP_API_CALL(hipDeviceSynchronize());
q = (4 * RUN_I + 2) % NUM_QUEUE;
HIP_API_CALL(hipGetLastError());
hipLaunchKernelGGL(multiply_kernel,
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
@@ -215,8 +228,8 @@ run(int NUM_QUEUE)
WIDTH,
HEIGHT);
HIP_API_CALL(hipDeviceSynchronize());
q = (4 * RUN_I + 3) % NUM_QUEUE;
HIP_API_CALL(hipGetLastError());
hipLaunchKernelGGL(divide_kernel,
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
@@ -227,17 +240,18 @@ run(int NUM_QUEUE)
deviceC[q],
WIDTH,
HEIGHT);
HIP_API_CALL(hipDeviceSynchronize());
HIP_API_CALL(hipGetLastError());
}
for(int q = 0; q < NUM_QUEUE; q++)
HIP_API_CALL(hipMemcpyAsync(
hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q]));
sync_streams();
for(int q = 0; q < NUM_QUEUE; q++)
{
HIP_API_CALL(hipMemcpy(hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost));
HIP_API_CALL(hipDeviceSynchronize());
HIP_API_CALL(hipMemcpyAsync(
hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q]));
sync_stream(q);
HIP_API_CALL(hipFree(deviceA[q]));
HIP_API_CALL(hipFree(deviceB[q]));
@@ -246,13 +260,21 @@ run(int NUM_QUEUE)
HIP_API_CALL(hipHostFree(hostA[q]));
HIP_API_CALL(hipHostFree(hostB[q]));
HIP_API_CALL(hipHostFree(hostC[q]));
HIP_API_CALL(hipStreamDestroy(streams[q]));
}
HIP_API_CALL(hipDeviceSynchronize());
}
int
main()
{
run(1);
int device_count = 0;
HIP_API_CALL(hipGetDeviceCount(&device_count));
for(int i = 0; i < device_count; ++i)
run(4, i);
return 0;
}
@@ -35,4 +35,4 @@ set_tests_properties(
PASS_REGULAR_EXPRESSION
"Test C tool is using rocprofiler-sdk v([0-9]+\\.[0-9]+\\.[0-9]+)"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -6,6 +6,11 @@ include(FetchContent)
set(FETCHCONTENT_BASE_DIR ${PROJECT_BINARY_DIR}/external)
# default FAIL_REGULAR_EXPRESSION for tests
set(ROCPROFILER_DEFAULT_FAIL_REGEX
"threw an exception|Permission denied|Could not create logging file"
CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests")
# build flags
add_library(rocprofiler-tests-build-flags INTERFACE)
add_library(rocprofiler::tests-build-flags ALIAS rocprofiler-tests-build-flags)
@@ -29,7 +29,7 @@ set_tests_properties(
ENVIRONMENT
"${PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>;ROCPROFILER_TOOL_OUTPUT_FILE=counter-collection-test.json;ROCPROFILER_TOOL_CONTEXTS=COUNTER_COLLECTION;ROCPROF_COUNTERS=SQ_WAVES_sum"
FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
foreach(FILENAME validate.py pytest.ini conftest.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
@@ -44,4 +44,4 @@ set_tests_properties(
test-counter-collection-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
test-counter-collection-execute FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -34,7 +34,7 @@ set(kernel-tracing-env
set_tests_properties(
test-kernel-tracing-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${kernel-tracing-env}"
FAIL_REGULAR_EXPRESSION "threw an exception")
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
foreach(FILENAME validate.py pytest.ini conftest.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
@@ -49,4 +49,4 @@ add_test(
set_tests_properties(
test-kernel-tracing-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS test-kernel-tracing-execute
FAIL_REGULAR_EXPRESSION "threw an exception")
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -0,0 +1,18 @@
#
# Integration test application libraries
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(rocprofiler-tests-lib LANGUAGES C CXX)
set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib")
# libraries used by integration test apps which DO NOT link to rocprofiler-sdk-roctx
add_subdirectory(vector-operations)
set(CMAKE_BUILD_RPATH
"\$ORIGIN:\$ORIGIN/../lib:$<TARGET_FILE_DIR:rocprofiler-sdk-roctx::rocprofiler-sdk-roctx-shared-library>"
)
# libraries used by integration test apps which DO link to rocprofiler-sdk-roctx
add_subdirectory(transpose)
@@ -0,0 +1,61 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-tests-lib-transpose-shared-library LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
option(TRANSPOSE_USE_MPI "Enable MPI support in transpose-shared-library exe" OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_EXTENSIONS OFF)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set_source_files_properties(transpose.cpp PROPERTIES LANGUAGE HIP)
add_library(transpose-shared-library SHARED)
target_sources(transpose-shared-library PRIVATE transpose.cpp)
target_compile_options(transpose-shared-library PRIVATE -W -Wall -Wextra -Wpedantic
-Wshadow -Werror)
target_include_directories(transpose-shared-library PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
set_target_properties(transpose-shared-library PROPERTIES OUTPUT_NAME transpose)
find_package(Threads REQUIRED)
target_link_libraries(transpose-shared-library PRIVATE Threads::Threads)
find_package(rocprofiler-sdk-roctx REQUIRED)
target_link_libraries(transpose-shared-library
PRIVATE rocprofiler-sdk-roctx::rocprofiler-sdk-roctx)
if(TRANSPOSE_USE_MPI)
find_package(MPI REQUIRED)
target_compile_definitions(transpose-shared-library PRIVATE USE_MPI)
target_link_libraries(transpose-shared-library PRIVATE MPI::MPI_C)
endif()
install(
TARGETS transpose-shared-library
DESTINATION lib
COMPONENT tests)
@@ -0,0 +1,260 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "hip/hip_runtime.h"
#include "rocprofiler-sdk-roctx/roctx.h"
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <random>
#include <stdexcept>
#if defined(USE_MPI)
# include <mpi.h>
#endif
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
throw std::runtime_error("hip_api_call"); \
} \
}
namespace
{
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
constexpr unsigned shared_mem_tile_dim = 32;
void
check_hip_error(void);
void
verify(int* in, int* out, int M, int N);
__global__ void
transpose(const int* in, int* out, int M, int N);
void
run_transpose_impl(int rank, int tid, int ndevice, size_t nitr, size_t nsync);
__global__ void
transpose(const int* in, int* out, int M, int N)
{
__shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim];
int idx = (blockIdx.y * blockDim.y + threadIdx.y) * M + blockIdx.x * blockDim.x + threadIdx.x;
tile[threadIdx.y][threadIdx.x] = in[idx];
__syncthreads();
idx = (blockIdx.x * blockDim.x + threadIdx.y) * N + blockIdx.y * blockDim.y + threadIdx.x;
out[idx] = tile[threadIdx.x][threadIdx.y];
}
void
run_transpose_impl(int rank, int tid, int devid, size_t nitr, size_t nsync)
{
roctxRangePush("run_transpose_impl");
constexpr unsigned int M = 4960 * 2;
constexpr unsigned int N = 4960 * 2;
hipStream_t stream = {};
printf("[transpose] Rank %i, thread %i assigned to device %i\n", rank, tid, devid);
HIP_API_CALL(hipSetDevice(devid));
HIP_API_CALL(hipStreamCreate(&stream));
auto_lock_t _lk{print_lock};
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
_lk.unlock();
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
std::uniform_int_distribution<int> _dist{0, 1000};
size_t size = sizeof(int) * M * N;
int* inp_matrix = new int[size];
int* out_matrix = new int[size];
for(size_t i = 0; i < M * N; i++)
{
inp_matrix[i] = _dist(_engine);
out_matrix[i] = 0;
}
int* in = nullptr;
int* out = nullptr;
HIP_API_CALL(hipMalloc(&in, size));
HIP_API_CALL(hipMalloc(&out, size));
HIP_API_CALL(hipMemsetAsync(in, 0, size, stream));
HIP_API_CALL(hipMemsetAsync(out, 0, size, stream));
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream));
HIP_API_CALL(hipStreamSynchronize(stream));
dim3 grid(M / 32, N / 32, 1);
dim3 block(32, 32, 1); // transpose
auto t1 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < nitr; ++i)
{
transpose<<<grid, block, 0, stream>>>(in, out, M, N);
check_hip_error();
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
}
auto t2 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream));
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
float GB = (float) size * nitr * 2 / (1 << 30);
print_lock.lock();
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
<< " sec\n";
std::cout << "[transpose][" << rank << "][" << tid
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
<< std::endl;
print_lock.unlock();
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipStreamDestroy(stream));
// cpu_transpose(matrix, out_matrix, M, N);
verify(inp_matrix, out_matrix, M, N);
HIP_API_CALL(hipFree(in));
HIP_API_CALL(hipFree(out));
delete[] inp_matrix;
delete[] out_matrix;
roctxRangePop();
}
void
check_hip_error(void)
{
hipError_t err = hipGetLastError();
if(err != hipSuccess)
{
auto_lock_t _lk{print_lock};
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
throw std::runtime_error("hip_api_call");
}
}
void
verify(int* in, int* out, int M, int N)
{
for(int i = 0; i < 10; i++)
{
int row = rand() % M;
int col = rand() % N;
if(in[row * N + col] != out[col * M + row])
{
auto_lock_t _lk{print_lock};
std::cout << "mismatch: " << row << ", " << col << " : " << in[row * N + col] << " | "
<< out[col * M + row] << "\n";
}
}
}
} // namespace
void
run_transpose(size_t nthreads, size_t nitr, size_t nsync)
{
auto range_id = roctxRangeStart("run_transpose");
int rank = 0;
int size = 1;
printf("[transpose] Number of threads: %zu\n", nthreads);
printf("[transpose] Number of iterations: %zu\n", nitr);
printf("[transpose] Syncing every %zu iterations\n", nsync);
#if defined(USE_MPI)
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
#else
(void) size;
#endif
// this is a temporary workaround in omnitrace when HIP + MPI is enabled
int ndevice = 0;
HIP_API_CALL(hipGetDeviceCount(&ndevice));
printf("[transpose] Number of devices found: %i\n", ndevice);
auto devids = std::vector<int>{};
devids.resize(size * nthreads, 0);
int devid = 0;
for(size_t i = 0; i < nthreads; ++i)
{
for(int j = 0; j < size; ++j)
{
auto idx = (j * nthreads) + i;
devids.at(idx) = devid++ % ndevice;
}
}
auto devid_offset = (rank * nthreads);
auto _threads = std::vector<std::thread>{};
for(size_t i = 1; i < nthreads; ++i)
_threads.emplace_back(
run_transpose_impl, rank, i, devids.at(devid_offset + i), nitr, nsync);
run_transpose_impl(rank, 0, devids.at(devid_offset + 0), nitr, nsync);
for(auto& itr : _threads)
itr.join();
#if defined(USE_MPI)
MPI_Barrier(MPI_COMM_WORLD);
#endif
// for(int i = 0; i < ndevice; ++i)
// {
// HIP_API_CALL(hipSetDevice(i));
// HIP_API_CALL(hipDeviceSynchronize());
// }
// #if defined(USE_MPI)
// MPI_Barrier(MPI_COMM_WORLD);
// #endif
// if(rank == 0)
// {
// for(int i = 0; i < ndevice; ++i)
// {
// HIP_API_CALL(hipSetDevice(i));
// HIP_API_CALL(hipDeviceReset());
// }
// }
// #if defined(USE_MPI)
// MPI_Barrier(MPI_COMM_WORLD);
// #endif
roctxRangeStop(range_id);
}
@@ -0,0 +1,28 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include <cstddef>
void
run_transpose(size_t nthreads, size_t nitr, size_t nsync);
@@ -0,0 +1,49 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-tests-lib-vector-operations LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_EXTENSIONS OFF)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set_source_files_properties(vector-ops.cpp PROPERTIES LANGUAGE HIP)
add_library(vector-ops-shared-library SHARED)
target_sources(vector-ops-shared-library PRIVATE vector-ops.cpp)
target_compile_options(vector-ops-shared-library PRIVATE -W -Wall -Wextra -Wpedantic
-Wshadow -Werror)
target_include_directories(vector-ops-shared-library PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
set_target_properties(vector-ops-shared-library PROPERTIES OUTPUT_NAME vector-ops)
find_package(Threads REQUIRED)
target_link_libraries(vector-ops-shared-library PRIVATE Threads::Threads)
install(
TARGETS vector-ops-shared-library
DESTINATION lib
COMPONENT tests)
@@ -0,0 +1,291 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include <assert.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <algorithm>
#include <iostream>
#include <mutex>
#include <vector>
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
throw std::runtime_error("hip_api_call"); \
} \
}
namespace
{
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
constexpr auto WIDTH = (1 << 12); // 4096
constexpr auto HEIGHT = (1 << 11); // 2048
constexpr auto DEPTH = (1 << 0); // 1
constexpr auto NUM = (WIDTH * HEIGHT * DEPTH);
struct dimensions
{
int x = 1;
int y = 1;
int z = 1;
};
constexpr auto threads_per_block = dimensions{64, 1, 1};
// Computes vectorAdd with matrix-multiply
template <typename Tp>
__global__ void
addition_kernel(Tp* __restrict__ a,
const Tp* __restrict__ b,
const Tp* __restrict__ c,
int width,
int /*height*/)
{
// printf("addition kernel\n");
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if(x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = b[index] + c[index];
}
template <typename Tp>
__global__ void
subtract_kernel(Tp* __restrict__ a,
const Tp* __restrict__ b,
const Tp* __restrict__ c,
int width,
int /*height*/)
{
// printf("subtract kernel\n");
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if(x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = abs(b[index] - c[index]);
}
template <typename Tp>
__global__ void
multiply_kernel(Tp* __restrict__ a,
const Tp* __restrict__ b,
const Tp* __restrict__ c,
int width,
int /*height*/)
{
// printf("multiply kernel\n");
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if(x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = (b[index] - 1) * (c[index] - 1) + 1;
}
template <typename Tp>
__global__ void
divide_kernel(Tp* __restrict__ a,
const Tp* __restrict__ b,
const Tp* __restrict__ c,
int width,
int /*height*/)
{
// printf("divide kernel\n");
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if(x >= WIDTH || y >= HEIGHT) return;
int index = y * width + x;
a[index] = (b[index] - c[index]) / abs(c[index] + b[index]) + 1;
}
void
run_vector_ops_impl(int num_queue, int device_id)
{
auto t1 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipSetDevice(device_id));
std::vector<float*> hostA(num_queue);
std::vector<float*> hostB(num_queue);
std::vector<float*> hostC(num_queue);
std::vector<float*> deviceA(num_queue);
std::vector<float*> deviceB(num_queue);
std::vector<float*> deviceC(num_queue);
std::vector<hipStream_t> streams(num_queue);
auto sync_stream = [num_queue, streams](int q) {
if(q < 0 || q >= num_queue)
throw std::runtime_error{std::string{"invalid stream id: "} + std::to_string(q)};
HIP_API_CALL(hipStreamSynchronize(streams.at(q)));
};
auto sync_streams = [num_queue, sync_stream]() {
for(int i = 0; i < num_queue; ++i)
sync_stream(i);
};
for(int q = 0; q < num_queue; q++)
{
HIP_API_CALL(hipStreamCreateWithFlags(&streams[q], hipStreamNonBlocking));
HIP_API_CALL(hipHostMalloc(&hostA[q], NUM * sizeof(float), 0));
HIP_API_CALL(hipHostMalloc(&hostB[q], NUM * sizeof(float), 0));
HIP_API_CALL(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0));
// initialize the input data
for(int i = 0; i < NUM; i++)
{
hostB[q][i] = static_cast<float>(i);
hostC[q][i] = static_cast<float>(i * 100.0f);
}
HIP_API_CALL(hipMallocAsync(&deviceA[q], NUM * sizeof(float), streams[q]));
HIP_API_CALL(hipMallocAsync(&deviceB[q], NUM * sizeof(float), streams[q]));
HIP_API_CALL(hipMallocAsync(&deviceC[q], NUM * sizeof(float), streams[q]));
HIP_API_CALL(hipMemcpyAsync(
deviceB[q], hostB[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
HIP_API_CALL(hipMemcpyAsync(
deviceC[q], hostC[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
}
sync_streams();
for(int q = 0; q < num_queue; q++)
{
hipLaunchKernelGGL(addition_kernel,
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
dim3(threads_per_block.x, threads_per_block.y),
0,
streams[q],
deviceA[q],
deviceB[q],
deviceC[q],
WIDTH,
HEIGHT);
hipLaunchKernelGGL(subtract_kernel,
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
dim3(threads_per_block.x, threads_per_block.y),
0,
streams[q],
deviceA[q],
deviceB[q],
deviceC[q],
WIDTH,
HEIGHT);
hipLaunchKernelGGL(multiply_kernel,
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
dim3(threads_per_block.x, threads_per_block.y),
0,
streams[q],
deviceA[q],
deviceB[q],
deviceC[q],
WIDTH,
HEIGHT);
hipLaunchKernelGGL(divide_kernel,
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
dim3(threads_per_block.x, threads_per_block.y),
0,
streams[q],
deviceB[q],
deviceA[q],
deviceC[q],
WIDTH,
HEIGHT);
}
sync_streams();
for(int q = 0; q < num_queue; q++)
{
HIP_API_CALL(hipMemcpyAsync(
hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q]));
sync_stream(q);
HIP_API_CALL(hipFree(deviceA[q]));
HIP_API_CALL(hipFree(deviceB[q]));
HIP_API_CALL(hipFree(deviceC[q]));
HIP_API_CALL(hipHostFree(hostA[q]));
HIP_API_CALL(hipHostFree(hostB[q]));
HIP_API_CALL(hipHostFree(hostC[q]));
HIP_API_CALL(hipStreamDestroy(streams[q]));
}
auto t2 = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
print_lock.lock();
std::cout << "[vector-ops] Runtime of vector-ops is " << time << " sec\n";
print_lock.unlock();
}
} // namespace
void
run_vector_ops(int num_threads, int num_queue)
{
int device_count = 0;
HIP_API_CALL(hipGetDeviceCount(&device_count));
if(device_count == 0) throw std::runtime_error{"No HIP devices found"};
num_threads = std::max<int>(num_threads, 1);
num_queue = std::max<int>(num_queue, 1);
auto _threads = std::vector<std::thread>{};
_threads.reserve(num_threads);
for(int i = 0; i < num_threads; ++i)
_threads.emplace_back(run_vector_ops_impl, num_queue, i % device_count);
for(auto& itr : _threads)
itr.join();
}
@@ -0,0 +1,26 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
void
run_vector_ops(int num_threads, int num_queue);
@@ -24,5 +24,6 @@ enable_testing()
include(CTest)
add_subdirectory(tracing)
add_subdirectory(counter-collection)
add_subdirectory(tracing-plus-cc)
add_subdirectory(tracing-hip-in-libraries)
add_subdirectory(counter-collection)
@@ -1,3 +1,7 @@
#
# Various counter collection tests
#
foreach(FILENAME conftest.py pytest.ini)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY)
@@ -40,7 +40,7 @@ set(cc-env-pmc1
set_tests_properties(
rocprofv3-test-counter-collection-pmc1-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-pmc1}"
FAIL_REGULAR_EXPRESSION "threw an exception")
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
add_test(NAME rocprofv3-test-counter-collection-pmc1-validate
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
@@ -50,4 +50,4 @@ set_tests_properties(
rocprofv3-test-counter-collection-pmc1-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
rocprofv3-test-counter-collection-pmc1-execute FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -1,30 +1,46 @@
import pandas as pd
#!/usr/bin/env python3
import sys
import pytest
import numpy as np
import pandas as pd
kernel_list = ["addition_kernel", "subtract_kernel", "multiply_kernel", "divide_kernel"]
kernel_list = sorted(
["addition_kernel", "subtract_kernel", "multiply_kernel", "divide_kernel"]
)
def unique(lst):
return list(set(lst))
def test_validate_counter_collection_pmc1(input_data: pd.DataFrame):
df = input_data
assert df.empty == False
assert not df.empty
assert (df["Agent_Id"].astype(int).values > 0).all()
assert (df["Queue_Id"].astype(int).values > 0).all()
assert (df["Process_Id"].astype(int).values > 0).all()
assert len(df["Kernel-Name"]) > 0
df_list = df["Kernel-Name"].values.flatten().tolist()
# Check if each string in kernel_list is present at least once
missing_kernels = []
for kernel in kernel_list:
if kernel not in df_list:
missing_kernels.append(kernel)
assert len(df["Kernel_Name"]) > 0
assert kernel_list == sorted(df["Kernel_Name"].unique().tolist())
kernel_count = dict([[itr, 0] for itr in kernel_list])
assert len(kernel_count) == len(kernel_list)
for itr in df["Kernel_Name"]:
kernel_count[itr] += 1
kn_cnt = [itr for _, itr in kernel_count.items()]
assert min(kn_cnt) == max(kn_cnt) and len(unique(kn_cnt)) == 1
assert (
not missing_kernels
), f"The following kernel names are missing from the out file: {missing_kernels}"
assert df["Counter_Name"].str.contains("SQ_WAVES").all()
assert len(df["Counter_Value"]) > 0
assert df["Counter_Name"].str.contains("SQ_WAVES").all()
assert (df["Counter_Value"].astype(int).values > 0).all()
di_list = df["Dispatch_Id"].astype(int).values.tolist()
di_uniq = sorted(df["Dispatch_Id"].unique().tolist())
# make sure the dispatch ids are unique and ordered
di_expect = [idx + 1 for idx in range(len(di_list))]
assert di_expect == di_uniq
if __name__ == "__main__":
@@ -40,7 +40,7 @@ set(cc-env-pmc2
set_tests_properties(
rocprofv3-test-counter-collection-pmc2-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-pmc2}"
FAIL_REGULAR_EXPRESSION "threw an exception")
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
add_test(NAME rocprofv3-test-counter-collection-pmc2-validate
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py
@@ -50,4 +50,4 @@ set_tests_properties(
rocprofv3-test-counter-collection-pmc2-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
rocprofv3-test-counter-collection-pmc2-execute FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -34,7 +34,7 @@ def test_validate_counter_collection_pmc2(input_dir: pd.DataFrame):
with open(file_path, "r") as file:
df = pd.read_csv(file)
# check if kernel-name is present
assert len(df["Kernel-Name"]) > 0
assert len(df["Kernel_Name"]) > 0
# check if counter value is positive
assert len(df["Counter_Value"]) > 0
@@ -41,7 +41,7 @@ set(cc-env-list-metrics
set_tests_properties(
rocprofv3-test-list-metrics-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-list-metrics}"
FAIL_REGULAR_EXPRESSION "threw an exception")
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
set_tests_properties(
rocprofv3-test-list-metrics-std-out-execute
@@ -76,6 +76,6 @@ set_tests_properties(
DEPENDS
rocprofv3-test-list-metrics-execute
FAIL_REGULAR_EXPRESSION
"threw an exception"
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
ATTACHED_FILES_ON_FAIL
"${VALIDATION_FILES}")
@@ -3,3 +3,4 @@
addopts = --durations=20 -rA -s -vv
testpaths = input1/validate.py
input2/validate.py
list_metrics/validate.py
@@ -0,0 +1,72 @@
#
# rocprofv3 tool test
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(
rocprofiler-tests-rocprofv3-tracing-hip-in-libraries
LANGUAGES CXX
VERSION 0.0.0)
find_package(rocprofiler-sdk REQUIRED)
add_test(
NAME rocprofv3-test-trace-hip-in-libraries-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --hip-runtime-trace
--hip-compiler-trace --hsa-core-trace --hsa-amd-trace --hsa-image-trace
--hsa-finalizer-trace --kernel-trace --memory-copy-trace -d
${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace -o out $<TARGET_FILE:hip-in-libraries>)
string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
set(tracing-env "${PRELOAD_ENV}"
"HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>")
set_tests_properties(
rocprofv3-test-trace-hip-in-libraries-execute
PROPERTIES
LABELS
"integration-tests"
ENVIRONMENT
"${tracing-env}"
FAIL_REGULAR_EXPRESSION
"HSA_CORE_API|HSA_AMD_EXT_API|HSA_IMAGE_EXT_API|HSA_FINALIZER_EXT_API|HIP_API|HIP_COMPILER_API|KERNEL_DISPATCH|CODE_OBJECT"
)
foreach(FILENAME validate.py conftest.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY)
endforeach()
add_test(
NAME rocprofv3-test-trace-hip-in-libraries-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --hsa-input
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hsa_api_trace.csv
--hip-input
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hip_api_trace.csv
--kernel-input
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_kernel_trace.csv
--memory-copy-input
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_memory_copy_trace.csv)
set(VALIDATION_FILES
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_memory_copy_trace.csv
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hsa_api_trace.csv
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hip_api_trace.csv
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_kernel_trace.csv)
set_tests_properties(
rocprofv3-test-trace-hip-in-libraries-validate
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
DEPENDS
rocprofv3-test-trace-hip-in-libraries-execute
FAIL_REGULAR_EXPRESSION
"AssertionError"
ATTACHED_FILES_ON_FAIL
"${VALIDATION_FILES}")
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
import os
import csv
import pytest
def pytest_addoption(parser):
parser.addoption(
"--hsa-input",
action="store",
help="Path to HSA API tracing CSV file.",
)
parser.addoption(
"--kernel-input",
action="store",
help="Path to kernel tracing CSV file.",
)
parser.addoption(
"--memory-copy-input",
action="store",
help="Path to memory-copy tracing CSV file.",
)
parser.addoption(
"--marker-input",
action="store",
help="Path to marker API tracing CSV file.",
)
parser.addoption(
"--hip-input",
action="store",
help="Path to HIP runtime and compiler API tracing CSV file.",
)
@pytest.fixture
def hsa_input_data(request):
filename = request.config.getoption("--hsa-input")
data = []
with open(filename, "r") as inp:
reader = csv.DictReader(inp)
for row in reader:
data.append(row)
return data
@pytest.fixture
def kernel_input_data(request):
filename = request.config.getoption("--kernel-input")
data = []
with open(filename, "r") as inp:
reader = csv.DictReader(inp)
for row in reader:
data.append(row)
return data
@pytest.fixture
def memory_copy_input_data(request):
filename = request.config.getoption("--memory-copy-input")
data = []
with open(filename, "r") as inp:
reader = csv.DictReader(inp)
for row in reader:
data.append(row)
return data
@pytest.fixture
def marker_input_data(request):
filename = request.config.getoption("--marker-input")
data = []
with open(filename, "r") as inp:
reader = csv.DictReader(inp)
for row in reader:
data.append(row)
return data
@pytest.fixture
def hip_input_data(request):
filename = request.config.getoption("--hip-input")
data = []
if os.path.exists(filename):
with open(filename, "r") as inp:
reader = csv.DictReader(inp)
for row in reader:
data.append(row)
return data
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
import sys
import pytest
class dim3(object):
def __init__(self, x, y, z):
self.x = int(x)
self.y = int(y)
self.z = int(z)
def as_tuple(self):
return (self.x, self.y, self.z)
def test_api_trace(hsa_input_data, hip_input_data):
functions = []
correlation_ids = []
for row in hsa_input_data:
assert row["Domain"] in (
"HSA_CORE_API",
"HSA_AMD_EXT_API",
"HSA_IMAGE_EXT_API",
"HSA_FINALIZE_EXT_API",
)
assert int(row["Process_Id"]) > 0
assert int(row["Thread_Id"]) >= int(row["Process_Id"])
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
functions.append(row["Function"])
correlation_ids.append(int(row["Correlation_Id"]))
for row in hip_input_data:
assert row["Domain"] in [
"HIP_RUNTIME_API",
"HIP_COMPILER_API",
]
assert int(row["Process_Id"]) > 0
assert int(row["Thread_Id"]) == 0 or int(row["Thread_Id"]) >= int(
row["Process_Id"]
)
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
functions.append(row["Function"])
correlation_ids.append(int(row["Correlation_Id"]))
correlation_ids = sorted(list(set(correlation_ids)))
# all correlation ids are unique
assert len(correlation_ids) == (len(hsa_input_data) + len(hip_input_data))
# correlation ids are numbered from 1 to N
assert correlation_ids[0] == 1
assert correlation_ids[-1] == len(correlation_ids)
functions = list(set(functions))
for itr in (
"hsa_amd_memory_async_copy_on_engine",
"hsa_agent_get_info",
"hsa_agent_iterate_isas",
"hsa_signal_create",
"hsa_agent_get_info",
"hsa_executable_symbol_get_info",
):
assert itr in functions
if hip_input_data:
for itr in (
"hipGetLastError",
"hipLaunchKernel",
"hipStreamSynchronize",
"hipMemcpyAsync",
"hipFree",
"hipStreamDestroy",
"hipDeviceSynchronize",
"hipDeviceReset",
"hipSetDevice",
):
assert itr in functions
def test_kernel_trace(kernel_input_data):
valid_kernel_names = sorted(
[
"__amd_rocclr_fillBufferAligned",
"(anonymous namespace)::transpose(int const*, int*, int, int)",
"void (anonymous namespace)::addition_kernel<float>(float*, float const*, float const*, int, int)",
"void (anonymous namespace)::divide_kernel<float>(float*, float const*, float const*, int, int)",
"void (anonymous namespace)::multiply_kernel<float>(float*, float const*, float const*, int, int)",
"void (anonymous namespace)::subtract_kernel<float>(float*, float const*, float const*, int, int)",
]
)
kernels = []
for row in kernel_input_data:
kernel_name = row["Kernel_Name"]
assert row["Kind"] == "KERNEL_DISPATCH"
assert int(row["Agent_Id"]) > 0
assert int(row["Queue_Id"]) > 0
assert int(row["Kernel_Id"]) > 0
assert int(row["Correlation_Id"]) > 0
assert kernel_name in valid_kernel_names
if kernel_name not in kernels:
kernels.append(kernel_name)
workgrp_size = dim3(
row["Workgroup_Size_X"], row["Workgroup_Size_Y"], row["Workgroup_Size_Z"]
)
grid_size = dim3(row["Grid_Size_X"], row["Grid_Size_Y"], row["Grid_Size_Z"])
if kernel_name == "__amd_rocclr_fillBufferAligned":
assert workgrp_size.as_tuple() > (1, 1, 1)
assert grid_size.as_tuple() > (1, 1, 1)
elif "transpose" in kernel_name:
assert workgrp_size.as_tuple() == (32, 32, 1)
assert grid_size.as_tuple() == (9920, 9920, 1)
else:
assert workgrp_size.as_tuple() == (64, 1, 1)
assert grid_size.as_tuple() == (4096, 2048, 1)
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
kernels = sorted(list(set(kernels)))
assert kernels == valid_kernel_names
def test_memory_copy_trace(memory_copy_input_data):
for row in memory_copy_input_data:
assert row["Kind"] == "MEMORY_COPY"
assert row["Direction"] in ("HOST_TO_DEVICE", "DEVICE_TO_HOST")
if row["Direction"] == "HOST_TO_DEVICE":
assert int(row["Source_Agent_Id"]) == 0
elif row["Direction"] == "DEVICE_TO_HOST":
assert int(row["Destination_Agent_Id"]) == 0
assert int(row["Correlation_Id"]) > 0
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
assert len(memory_copy_input_data) == 120
if __name__ == "__main__":
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
sys.exit(exit_code)
@@ -40,7 +40,7 @@ set(cc-tracing-env
set_tests_properties(
rocprofv3-test-tracing-plus-cc-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-tracing-env}"
FAIL_REGULAR_EXPRESSION "threw an exception")
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
add_test(NAME rocprofv3-test-tracing-plus-cc-validate
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py
@@ -50,4 +50,4 @@ set_tests_properties(
rocprofv3-test-tracing-plus-cc-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
rocprofv3-test-tracing-plus-cc-execute FAIL_REGULAR_EXPRESSION
"threw an exception")
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -34,7 +34,7 @@ def test_validate_counter_collection_plus_tracing(input_dir: pd.DataFrame):
with open(file_path, "r") as file:
df = pd.read_csv(file)
# check if either kernel-name/FUNCTION is present
assert "Kernel-Name" in df.columns or "Function" in df.columns
assert "Kernel_Name" in df.columns or "Function" in df.columns
if __name__ == "__main__":
@@ -23,7 +23,7 @@ def test_hsa_api_trace(hsa_input_data):
correlation_ids = sorted(list(set(correlation_ids)))
hsa_api_calls_offset = 2 # roctxRangePush is first
num_marker_api_calls = 6 # seven marker API calls, only six entries in
num_marker_api_calls = 7 # seven marker API calls, only six entries in
# marker csv data because roctxRangePush + roctxRangePop is one entry
# all correlation ids are unique
@@ -526,19 +526,16 @@ counter_collection_buffered(rocprofiler_context_id_t, /*context*/
}
void
dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t, /*correlation_id*/
const hsa_kernel_dispatch_packet_t*, /*dispatch_packet*/
uint64_t, /*kernel_id*/
void* /*callback_data_args*/,
rocprofiler_profile_config_id_t* config)
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
rocprofiler_user_data_t* /*user_data*/,
void* /*callback_data_args*/)
{
static std::shared_mutex m_mutex = {};
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
auto search_cache = [&]() {
if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end())
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
{
*config = pos->second;
return true;
@@ -565,7 +562,7 @@ dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/
// Iterate through the agents and get the counters available on that agent
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
agent->id,
dispatch_data.agent_id,
[]([[maybe_unused]] rocprofiler_agent_id_t id,
rocprofiler_counter_id_t* counters,
size_t num_counters,
@@ -600,11 +597,12 @@ dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/
// Create a colleciton profile for the counters
rocprofiler_profile_config_id_t profile;
ROCPROFILER_CALL(rocprofiler_create_profile_config(
agent->id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
ROCPROFILER_CALL(
rocprofiler_create_profile_config(
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
"Could not construct profile cfg");
profile_cache.emplace(agent->id.handle, profile);
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
// Return the profile to collect those counters for this dispatch
*config = profile;
}