Shared Library Constructor (rocprofv3 deadlock fix) (#599)
* Moved tests/apps to tests/bin
* Renamed cmake project in tests/bin
* Update samples
- Use ROCPROFILER_DEFAULT_FAIL_REGEX
- tweaks to stdout messages
* Update tests
- Use ROCPROFILER_DEFAULT_FAIL_REGEX
* Add tests/lib
- libraries with HIP code
* Update PTL submodule
- remove atexit delete of thread_id_map
* Update cmake/rocprofiler_options.cmake
- Set ROCPROFILER_DEFAULT_FAIL_REGEX
* Update common lib: env + logging
- improved customization of logging settings
- default to disabling logging to files
- install failure handler for rocprofv3
- set_env support in environment.*
* Add lib/rocprofiler-sdk/shared_library.cpp
- shared library constructor
* Update lib/rocprofiler-sdk-tool/tool.cpp
- destructor thread safety
- convert callback_name_info and buffered_name_info to pointers
- install failure handler for logging
* Add tests/bin/hip-in-libraries
- hip-in-libraries is an exe which uses two shared libraries where each shared library contains HIP kernels
- used for testing deadlocking within __hipRegisterFatBinary
* Update bin/rocprofv3
- reorganized the env variables
- use exec to launch command
- set ROCPROFILER_LIBRARY_CTOR=1
* Add tests/rocprofv3/tracing-hip-in-libraries
- uses hip-in-libraries exe for exe which uses shared libraries to launch HIP kernels
* Update bin/rocprofv3
- fix counter collection (no exec)
* Update lib/rocprofiler-sdk-tool/tool.cpp
- replace "Kernel-Name" with "Kernel_Name"
* Update lib/rocprofiler-sdk/registration.cpp
Use RTLD_LOCAL instead of RTLD_GLOBAL for env libraries
* Update tests/rocprofv3
- replace "Kernel-Name" with "Kernel_Name"
* Update tests
- vector-ops (bin) stream syncs + runs with 4 queues per device
- improve counter-collection/input1 validation
- rocprofv3/tracing-hip-in-libraries does not do sys-trace
- improved validation script for tracing-hip-in-libraries
- updated dispatch_callback in json-tool.cpp following reworking of prototypes for counter collection
* Update samples/counter_collection
- updated dispatch_callback(s) and record_callback(s) following reworking of prototypes
* Update bin/rocprofv3
- reorganized help menu
- added options for sub-HSA tables
- added --hip-runtime-trace
- changed --hip-trace to include --hip-compiler-trace
* Update lib/rocprofiler-sdk-tool
- improved kernel filtering
- removed arch_vgpr, accum_vgpr, sgpr code (in rocprofiler-sdk)
- fixed issue with counter-collection w/o tracing
- added support for fine grained HSA API tracing
- removed directly linking to HSA-runtime
* Update lib/rocprofiler-sdk/agent.cpp
- rocp_agents != hsa_agents is non-fatal when ROCPROFILER_BUILD_CI=OFF (CMake option)
* GPR (vector and scalar) info in kernel symbol data
- rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t contains general purpose register info
* Header include order fix
- Include repo headers first
- Third party library headers next
- standard library headers last
* Update dispatch profiling public API
- introduce rocprofiler_profile_counting_dispatch_data_t
- change signature of rocprofiler_profile_counting_dispatch_callback_t and rocprofiler_profile_counting_record_callback_t
- provide rocprofiler_user_data_t pointer in dispatch callback
- provide rocprofiler_user_data_t value (from dispatch cb) in record callback
* Update tests/bin/CMakeLists.txt
- fix add_subdirectory(hip-in-libraries) order
* Update VERSION
- bump to 0.2.0 in prep for AFAR
[ROCm/rocprofiler-sdk commit: 7b6d3c70bd]
This commit is contained in:
committed by
GitHub
parent
1295c42022
commit
407fc57ede
@@ -1 +1 @@
|
||||
0.1.0
|
||||
0.2.0
|
||||
|
||||
@@ -131,3 +131,8 @@ if(ASAN)
|
||||
endif()
|
||||
|
||||
include(rocprofiler_memcheck)
|
||||
|
||||
# default FAIL_REGULAR_EXPRESSION for tests
|
||||
set(ROCPROFILER_DEFAULT_FAIL_REGEX
|
||||
"threw an exception|Permission denied|Could not create logging file"
|
||||
CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests")
|
||||
|
||||
+1
-1
Submodule projects/rocprofiler-sdk/external/ptl updated: 12ca26ac2b...48df416254
@@ -55,4 +55,4 @@ set_tests_properties(
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -155,7 +155,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
if(argc > 3) nsync = atoll(argv[3]);
|
||||
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
_lk.unlock();
|
||||
|
||||
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
|
||||
@@ -183,7 +183,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
dim3 block(32, 32, 1); // transpose_a
|
||||
|
||||
print_lock.lock();
|
||||
printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
|
||||
printf("[transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
|
||||
rank,
|
||||
tid,
|
||||
grid.x,
|
||||
@@ -208,8 +208,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
float GB = (float) size * nitr * 2 / (1 << 30);
|
||||
|
||||
print_lock.lock();
|
||||
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
|
||||
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
|
||||
<< " sec\n";
|
||||
std::cout << "[transpose][" << rank << "][" << tid
|
||||
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
<< std::endl;
|
||||
print_lock.unlock();
|
||||
|
||||
|
||||
@@ -57,4 +57,4 @@ set(callback-api-tracing-env
|
||||
set_tests_properties(
|
||||
callback-api-tracing
|
||||
PROPERTIES TIMEOUT 45 LABELS "samples" ENVIRONMENT "${callback-api-tracing-env}"
|
||||
FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -177,7 +177,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
if(argc > 3) nsync = atoll(argv[3]);
|
||||
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
_lk.unlock();
|
||||
|
||||
auto _seed = std::random_device{}() * (rank + 1) * (tid + 1);
|
||||
@@ -219,8 +219,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
float GB = (float) size * nitr * 2 / (1 << 30);
|
||||
|
||||
print_lock.lock();
|
||||
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
|
||||
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
|
||||
<< " sec\n";
|
||||
std::cout << "[transpose][" << rank << "][" << tid
|
||||
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
<< std::endl;
|
||||
print_lock.unlock();
|
||||
|
||||
|
||||
@@ -55,4 +55,4 @@ set_tests_properties(
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -145,7 +145,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
if(argc > 3) nsync = atoll(argv[3]);
|
||||
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
_lk.unlock();
|
||||
|
||||
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
|
||||
@@ -173,7 +173,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
dim3 block(32, 32, 1); // transpose_a
|
||||
|
||||
print_lock.lock();
|
||||
printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
|
||||
printf("[transpose][%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n",
|
||||
rank,
|
||||
tid,
|
||||
grid.x,
|
||||
@@ -198,8 +198,10 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
float GB = (float) size * nitr * 2 / (1 << 30);
|
||||
|
||||
print_lock.lock();
|
||||
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
|
||||
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
|
||||
<< " sec\n";
|
||||
std::cout << "[transpose][" << rank << "][" << tid
|
||||
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
<< std::endl;
|
||||
print_lock.unlock();
|
||||
|
||||
|
||||
@@ -2,6 +2,11 @@
|
||||
# common utilities for samples
|
||||
#
|
||||
|
||||
# default FAIL_REGULAR_EXPRESSION for tests
|
||||
set(ROCPROFILER_DEFAULT_FAIL_REGEX
|
||||
"threw an exception|Permission denied|Could not create logging file"
|
||||
CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests")
|
||||
|
||||
# build flags
|
||||
add_library(rocprofiler-samples-build-flags INTERFACE)
|
||||
add_library(rocprofiler::samples-build-flags ALIAS rocprofiler-samples-build-flags)
|
||||
|
||||
@@ -52,7 +52,7 @@ set_tests_properties(
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
add_library(counter-collection-callback-client SHARED)
|
||||
target_sources(counter-collection-callback-client PRIVATE callback_client.cpp client.hpp)
|
||||
@@ -80,7 +80,7 @@ set_tests_properties(
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
add_library(counter-collection-functional-counter-client SHARED)
|
||||
target_sources(counter-collection-functional-counter-client
|
||||
@@ -109,4 +109,4 @@ set_tests_properties(
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -67,15 +67,14 @@ get_client_ctx()
|
||||
}
|
||||
|
||||
void
|
||||
record_callback(rocprofiler_queue_id_t,
|
||||
rocprofiler_agent_id_t,
|
||||
rocprofiler_correlation_id_t,
|
||||
uint64_t,
|
||||
void* callback_data_args,
|
||||
size_t record_count,
|
||||
rocprofiler_record_counter_t* record_data)
|
||||
record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_record_counter_t* record_data,
|
||||
size_t record_count,
|
||||
rocprofiler_user_data_t user_data,
|
||||
void* callback_data_args)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "Kernel_id " << dispatch_data.kernel_id << ": ";
|
||||
for(size_t i = 0; i < record_count; ++i)
|
||||
{
|
||||
ss << "(Id: " << record_data[i].id << " Value [D]: " << record_data[i].counter_value
|
||||
@@ -84,6 +83,8 @@ record_callback(rocprofiler_queue_id_t,
|
||||
auto* output_stream = static_cast<std::ostream*>(callback_data_args);
|
||||
if(!output_stream) throw std::runtime_error{"nullptr to output stream"};
|
||||
*output_stream << "[" << __FUNCTION__ << "] " << ss.str() << "\n";
|
||||
|
||||
(void) user_data;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -93,13 +94,10 @@ record_callback(rocprofiler_queue_id_t,
|
||||
* to collect the counter SQ_WAVES for all kernel dispatch packets.
|
||||
*/
|
||||
void
|
||||
dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t /*correlation_id*/,
|
||||
const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/,
|
||||
uint64_t /*kernel_id*/,
|
||||
void* /*callback_data_args*/,
|
||||
rocprofiler_profile_config_id_t* config)
|
||||
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
rocprofiler_user_data_t* /*user_data*/,
|
||||
void* /*callback_data_args*/)
|
||||
{
|
||||
/**
|
||||
* This simple example uses the same profile counter set for all agents.
|
||||
@@ -112,7 +110,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
|
||||
|
||||
auto search_cache = [&]() {
|
||||
if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end())
|
||||
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
|
||||
{
|
||||
*config = pos->second;
|
||||
return true;
|
||||
@@ -135,7 +133,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
|
||||
// Iterate through the agents and get the counters available on that agent
|
||||
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
|
||||
agent->id,
|
||||
dispatch_data.agent_id,
|
||||
[](rocprofiler_agent_id_t,
|
||||
rocprofiler_counter_id_t* counters,
|
||||
size_t num_counters,
|
||||
@@ -169,11 +167,12 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
|
||||
// Create a colleciton profile for the counters
|
||||
rocprofiler_profile_config_id_t profile;
|
||||
ROCPROFILER_CALL(rocprofiler_create_profile_config(
|
||||
agent->id, collect_counters.data(), collect_counters.size(), &profile),
|
||||
"Could not construct profile cfg");
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_create_profile_config(
|
||||
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
|
||||
"Could not construct profile cfg");
|
||||
|
||||
profile_cache.emplace(agent->id.handle, profile);
|
||||
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
|
||||
// Return the profile to collect those counters for this dispatch
|
||||
*config = profile;
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
@@ -116,13 +117,10 @@ buffered_callback(rocprofiler_context_id_t,
|
||||
* to collect the counter SQ_WAVES for all kernel dispatch packets.
|
||||
*/
|
||||
void
|
||||
dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t /*correlation_id*/,
|
||||
const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/,
|
||||
uint64_t /*kernel_id*/,
|
||||
void* /*callback_data_args*/,
|
||||
rocprofiler_profile_config_id_t* config)
|
||||
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
rocprofiler_user_data_t* /*user_data*/,
|
||||
void* /*callback_data_args*/)
|
||||
{
|
||||
/**
|
||||
* This simple example uses the same profile counter set for all agents.
|
||||
@@ -135,7 +133,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
|
||||
|
||||
auto search_cache = [&]() {
|
||||
if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end())
|
||||
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
|
||||
{
|
||||
*config = pos->second;
|
||||
return true;
|
||||
@@ -158,7 +156,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
|
||||
// Iterate through the agents and get the counters available on that agent
|
||||
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
|
||||
agent->id,
|
||||
dispatch_data.agent_id,
|
||||
[](rocprofiler_agent_id_t,
|
||||
rocprofiler_counter_id_t* counters,
|
||||
size_t num_counters,
|
||||
@@ -192,11 +190,12 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
|
||||
// Create a colleciton profile for the counters
|
||||
rocprofiler_profile_config_id_t profile;
|
||||
ROCPROFILER_CALL(rocprofiler_create_profile_config(
|
||||
agent->id, collect_counters.data(), collect_counters.size(), &profile),
|
||||
"Could not construct profile cfg");
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_create_profile_config(
|
||||
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
|
||||
"Could not construct profile cfg");
|
||||
|
||||
profile_cache.emplace(agent->id.handle, profile);
|
||||
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
|
||||
// Return the profile to collect those counters for this dispatch
|
||||
*config = profile;
|
||||
}
|
||||
|
||||
@@ -189,15 +189,46 @@ buffered_callback(rocprofiler_context_id_t,
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t /*correlation_id*/,
|
||||
const hsa_kernel_dispatch_packet_t* /*dispatch_packet*/,
|
||||
uint64_t /*kernel_id*/,
|
||||
void* /*callback_data_args*/,
|
||||
rocprofiler_profile_config_id_t* config)
|
||||
using agent_map_t = std::map<uint64_t, const rocprofiler_agent_v0_t*>;
|
||||
|
||||
agent_map_t
|
||||
get_agent_info()
|
||||
{
|
||||
auto iterate_cb = [](rocprofiler_agent_version_t agents_ver,
|
||||
const void** agents_arr,
|
||||
size_t num_agents,
|
||||
void* user_data) {
|
||||
if(agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0)
|
||||
throw std::runtime_error{"unexpected rocprofiler agent version"};
|
||||
|
||||
auto* agents_v = static_cast<agent_map_t*>(user_data);
|
||||
for(size_t i = 0; i < num_agents; ++i)
|
||||
{
|
||||
const auto* itr = static_cast<const rocprofiler_agent_v0_t*>(agents_arr[i]);
|
||||
agents_v->emplace(itr->id.handle, itr);
|
||||
}
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
auto _agents = agent_map_t{};
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0,
|
||||
iterate_cb,
|
||||
sizeof(rocprofiler_agent_t),
|
||||
const_cast<void*>(static_cast<const void*>(&_agents))),
|
||||
"query available agents");
|
||||
|
||||
return _agents;
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
rocprofiler_user_data_t* /*user_data*/,
|
||||
void* /*callback_data_args*/)
|
||||
{
|
||||
static auto agents = get_agent_info();
|
||||
|
||||
auto& cap = *get_capture();
|
||||
auto wlock = std::unique_lock{cap.m_mutex};
|
||||
|
||||
@@ -211,7 +242,7 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
{
|
||||
std::vector<rocprofiler_counter_id_t> counters_needed;
|
||||
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
|
||||
agent->id,
|
||||
dispatch_data.agent_id,
|
||||
[](rocprofiler_agent_id_t,
|
||||
rocprofiler_counter_id_t* counters,
|
||||
size_t num_counters,
|
||||
@@ -237,9 +268,9 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
"Could not query counter_id");
|
||||
cap.expected_counter_names.emplace(found_counter.handle, std::string(version.name));
|
||||
size_t expected = 0;
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_query_counter_instance_count(agent->id, found_counter, &expected),
|
||||
"COULD NOT QUERY INSTANCES");
|
||||
ROCPROFILER_CALL(rocprofiler_query_counter_instance_count(
|
||||
dispatch_data.agent_id, found_counter, &expected),
|
||||
"COULD NOT QUERY INSTANCES");
|
||||
cap.remaining.push_back(found_counter);
|
||||
cap.expected.emplace(found_counter.handle, expected);
|
||||
|
||||
@@ -266,7 +297,8 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
}
|
||||
if(cap.expected.empty())
|
||||
{
|
||||
std::clog << "No counters found for agent - " << agent->name;
|
||||
std::clog << "No counters found for agent " << dispatch_data.agent_id.handle << " ("
|
||||
<< agents.at(dispatch_data.agent_id.handle)->name << ")";
|
||||
}
|
||||
}
|
||||
if(cap.remaining.empty()) return;
|
||||
@@ -274,9 +306,9 @@ dispatch_callback(rocprofiler_queue_id_t /*queue_id*/,
|
||||
rocprofiler_profile_config_id_t profile;
|
||||
|
||||
// Select the next counter to collect.
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_create_profile_config(agent->id, &(cap.remaining.back()), 1, &profile),
|
||||
"Could not construct profile cfg");
|
||||
ROCPROFILER_CALL(rocprofiler_create_profile_config(
|
||||
dispatch_data.agent_id, &(cap.remaining.back()), 1, &profile),
|
||||
"Could not construct profile cfg");
|
||||
|
||||
cap.remaining.pop_back();
|
||||
*config = profile;
|
||||
|
||||
@@ -54,4 +54,4 @@ set_tests_properties(
|
||||
ENVIRONMENT
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -145,7 +145,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
if(argc > 3) nsync = atoll(argv[3]);
|
||||
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
_lk.unlock();
|
||||
|
||||
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
|
||||
|
||||
@@ -5,9 +5,16 @@ set -eo pipefail
|
||||
ROCPROFV3_DIR=$(dirname -- "$(realpath "${BASH_SOURCE[0]}")")
|
||||
ROCM_DIR=$(dirname -- "${ROCPROFV3_DIR}")
|
||||
|
||||
: ${HSA_TOOLS_LIB:="${ROCM_DIR}/lib/librocprofiler-sdk.so"}
|
||||
: ${ROCPROFILER_LIBRARY_CTOR:=1}
|
||||
: ${ROCPROF_OUTPUT_PATH:="."}
|
||||
: ${ROCPROF_OUTPUT_PATH_INTERNAL:="."}
|
||||
: ${ROCPROF_OUTPUT_FILE_NAME:=""}
|
||||
: ${ROCPROF_COUNTERS_PATH:=""}
|
||||
: ${ROCPROF_PRELOAD:=""}
|
||||
: ${ROCPROF_TOOL_LIBRARY:="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"}
|
||||
: ${ROCPROF_SDK_LIBRARY:="${ROCM_DIR}/lib/librocprofiler-sdk.so"}
|
||||
|
||||
export HSA_TOOLS_LIB
|
||||
export ROCPROFILER_LIBRARY_CTOR
|
||||
|
||||
# Define color codes
|
||||
GREEN='\033[0;32m'
|
||||
@@ -19,27 +26,40 @@ usage() {
|
||||
if [ -z "${EC}" ]; then EC=1; fi
|
||||
echo -e "${RESET}ROCProfilerV3 Run Script Usage:"
|
||||
echo -e "${GREEN}-h | --help ${RESET} For showing this message"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}--hip-trace ${RESET} For Collecting HIP Traces (runtime + compiler)"
|
||||
echo -e "${GREEN}--hip-runtime-trace ${RESET} For Collecting HIP Runtime API Traces"
|
||||
echo -e "${GREEN}--hip-compiler-trace ${RESET} For Collecting HIP Compiler generated code Traces"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces"
|
||||
echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces"
|
||||
echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces (core + amd + image + finalizer)"
|
||||
echo -e "${GREEN}--hsa-core-trace ${RESET} For Collecting HSA API Traces (core API)"
|
||||
echo -e "${GREEN}--hsa-amd-trace ${RESET} For Collecting HSA API Traces (AMD-extension API)"
|
||||
echo -e "${GREEN}--hsa-image-trace ${RESET} For Collecting HSA API Traces (Image-extenson API)"
|
||||
echo -e "${GREEN}--hsa-finalizer-trace ${RESET} For Collecting HSA API Traces (Finalizer-extension API)"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}--sys-trace ${RESET} For Collecting HIP,HSA, Memory Copy, (marker)ROCTx and Kernel dispatch traces\n"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}-o | --output-file ${RESET} For the output file name"
|
||||
echo -e "\t#${GREY} usage (with current dir): rocprofv3 --hsa-trace -o <file_name> <executable>"
|
||||
echo -e "\t#${GREY} usage (with custom dir): rocprofv3 --hsa-trace -d <out_dir> -o <file_name> <executable>${RESET}\n"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}-d | --output-directory ${RESET} For adding output path where the output files will be saved"
|
||||
echo -e "\t#${GREY} usage (with custom dir): rocprofv3 --hsa-trace -d <out_dir> <executable>${RESET}"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}-M | --mangled-kernels ${RESET} Do not demangle the kernel names"
|
||||
echo -e "${GREEN}-T | --truncate-kernels ${RESET} Truncate the demangled kernel names"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}-L | --list-metrics ${RESET} List metrics for counter collection"
|
||||
echo -e "${GREEN}-i | --input ${RESET} For counter collection "
|
||||
echo -e "\t#${GREY} Input file .txt format, automatically rerun application for every profiling features line"
|
||||
echo -e "\t# Perf counters group 1"
|
||||
echo -e "\tpmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts"
|
||||
echo -e "\t# Perf counters group 2"
|
||||
echo -e "\tpmc : WriteSize L2CacheHit ${RESET}"
|
||||
echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces"
|
||||
echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces"
|
||||
echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces"
|
||||
echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces"
|
||||
echo -e "${GREEN}--hip-trace ${RESET} For Collecting HIP Runtime Traces"
|
||||
echo -e "${GREEN}--hip-compiler-trace ${RESET} For Collecting HIP Compiler generated code Traces"
|
||||
echo -e "${GREEN}--sys-trace ${RESET} For Collecting HIP,HSA, Memory Copy, (marker)ROCTx and Kernel dispatch traces\n"
|
||||
echo -e "${GREEN}-o | --output-file ${RESET} For the output file name"
|
||||
echo -e "\t#${GREY} usage e.g:(with current dir): rocprofv3 --hsa-trace -o <file_name> <executable>"
|
||||
echo -e "\t#${GREY} usage e.g:(with custom dir): rocprofv3 --hsa-trace -d <out_dir> -o <file_name> <executable>${RESET}\n"
|
||||
echo -e "${GREEN}-d | --output-directory ${RESET} For adding output path where the output files will be saved"
|
||||
echo -e "\t#${GREY} usage e.g:(with custom dir): rocprofv3 --hsa-trace -d <out_dir> <executable>${RESET}"
|
||||
echo -e "${GREEN}-M | --mangled-kernels ${RESET} Do not demangle the kernel names"
|
||||
echo -e "${GREEN}-T | --truncate-kernels ${RESET} Truncate the demangled kernel names"
|
||||
echo -e "${GREEN}-L | --list-metrics ${RESET} List metrics"
|
||||
echo -e ""
|
||||
exit ${EC}
|
||||
}
|
||||
@@ -48,11 +68,22 @@ if [ -z "$1" ]; then
|
||||
usage 1
|
||||
fi
|
||||
|
||||
: ${ROCPROF_OUTPUT_PATH:="."}
|
||||
: ${ROCPROF_OUTPUT_PATH_INTERNAL:="."}
|
||||
: ${ROCPROF_OUTPUT_FILE_NAME:=""}
|
||||
: ${ROCPROF_COUNTERS_PATH:=""}
|
||||
: ${ROCPROF_PRELOAD:=""}
|
||||
if [ -n "${ROCPROF_PRELOAD}" ]; then
|
||||
ROCPROF_PRELOAD="${ROCPROF_PRELOAD}:${ROCPROF_TOOL_LIBRARY}:${ROCPROF_SDK_LIBRARY}"
|
||||
else
|
||||
ROCPROF_PRELOAD="${ROCPROF_TOOL_LIBRARY}:${ROCPROF_SDK_LIBRARY}"
|
||||
fi
|
||||
|
||||
if [ -n "${ROCP_TOOL_LIBRARIES}" ]; then
|
||||
ROCP_TOOL_LIBRARIES="${ROCP_TOOL_LIBRARIES}:${ROCPROF_TOOL_LIBRARY}"
|
||||
else
|
||||
ROCP_TOOL_LIBRARIES="${ROCPROF_TOOL_LIBRARY}"
|
||||
fi
|
||||
|
||||
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH}
|
||||
|
||||
export ROCP_TOOL_LIBRARIES
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
while true; do
|
||||
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
|
||||
@@ -93,14 +124,26 @@ while true; do
|
||||
shift
|
||||
shift
|
||||
elif [ "$1" == "--hsa-trace" ]; then
|
||||
export ROCPROF_HSA_API_TRACE=1
|
||||
export ROCPROF_HSA_CORE_API_TRACE=1
|
||||
export ROCPROF_HSA_AMD_EXT_API_TRACE=1
|
||||
export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1
|
||||
export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--hsa-core-trace" ]; then
|
||||
export ROCPROF_HSA_CORE_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--hsa-amd-trace" ]; then
|
||||
export ROCPROF_HSA_AMD_EXT_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--hsa-image-trace" ]; then
|
||||
export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--hsa-finalizer-trace" ]; then
|
||||
export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1
|
||||
shift
|
||||
elif [[ "$1" == "-L" || "$1" == "--list-metrics" ]]; then
|
||||
export ROCPROF_LIST_METRICS=1
|
||||
ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \
|
||||
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \
|
||||
LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \
|
||||
exec ${ROCM_DIR}/lib/rocprofiler-sdk/rocprofv3-trigger-list-metrics
|
||||
LD_PRELOAD="${ROCPROF_PRELOAD}" exec ${ROCM_DIR}/lib/rocprofiler-sdk/rocprofv3-trigger-list-metrics
|
||||
elif [ "$1" == "--kernel-trace" ]; then
|
||||
export ROCPROF_KERNEL_TRACE=1
|
||||
shift
|
||||
@@ -111,17 +154,25 @@ while true; do
|
||||
export ROCPROF_MARKER_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--hip-trace" ]; then
|
||||
export ROCPROF_HIP_API_TRACE=1
|
||||
export ROCPROF_HIP_RUNTIME_API_TRACE=1
|
||||
export ROCPROF_HIP_COMPILER_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--hip-runtime-trace" ]; then
|
||||
export ROCPROF_HIP_RUNTIME_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--hip-compiler-trace" ]; then
|
||||
export ROCPROF_HIP_COMPILER_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--sys-trace" ]; then
|
||||
export ROCPROF_HSA_API_TRACE=1
|
||||
export ROCPROF_HSA_CORE_API_TRACE=1
|
||||
export ROCPROF_HSA_AMD_EXT_API_TRACE=1
|
||||
export ROCPROF_HSA_IMAGE_EXT_API_TRACE=1
|
||||
export ROCPROF_HSA_FINALIZER_EXT_API_TRACE=1
|
||||
export ROCPROF_HIP_RUNTIME_API_TRACE=1
|
||||
export ROCPROF_HIP_COMPILER_API_TRACE=1
|
||||
export ROCPROF_MARKER_API_TRACE=1
|
||||
export ROCPROF_KERNEL_TRACE=1
|
||||
export ROCPROF_MEMORY_COPY_TRACE=1
|
||||
export ROCPROF_MARKER_API_TRACE=1
|
||||
export ROCPROF_HIP_API_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--" ]; then
|
||||
shift
|
||||
@@ -161,18 +212,12 @@ if [ -n "${PMC_LINES:-}" ]; then
|
||||
export ROCPROF_OUTPUT_PATH=$RESULT_PATH
|
||||
fi
|
||||
((COUNTER++))
|
||||
ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \
|
||||
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \
|
||||
LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \
|
||||
"${@}"
|
||||
LD_PRELOAD="${ROCPROF_PRELOAD}" "${@}"
|
||||
if [ -n "$ROCPROF_OUTPUT_PATH" ]; then
|
||||
echo -e "\nThe output path for the following counters: $ROCPROF_OUTPUT_PATH"
|
||||
fi
|
||||
done
|
||||
else
|
||||
# for non counter collection. e.g: tracing
|
||||
ROCP_TOOL_LIBRARIES="${ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so" \
|
||||
LD_LIBRARY_PATH=${ROCM_DIR}/lib:${LD_LIBRARY_PATH} \
|
||||
LD_PRELOAD="${ROCPROF_PRELOAD}:${ROCM_DIR}/lib/librocprofiler-sdk.so" \
|
||||
"${@}"
|
||||
LD_PRELOAD="${ROCPROF_PRELOAD}" exec "${@}"
|
||||
fi
|
||||
|
||||
@@ -30,6 +30,8 @@
|
||||
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
ROCPROFILER_EXTERN_C_INIT
|
||||
|
||||
/**
|
||||
@@ -141,9 +143,14 @@ typedef struct
|
||||
uint32_t kernarg_segment_alignment; ///< Alignment (in bytes) of the buffer used to pass
|
||||
///< arguments to the kernel
|
||||
uint32_t group_segment_size; ///< Size of static group segment memory required by the kernel
|
||||
///< (per work-group), in bytes
|
||||
///< (per work-group), in bytes. AKA: LDS size
|
||||
uint32_t private_segment_size; ///< Size of static private, spill, and arg segment memory
|
||||
///< required by this kernel (per work-item), in bytes.
|
||||
///< required by this kernel (per work-item), in bytes. AKA:
|
||||
///< scratch size
|
||||
uint32_t sgpr_count; ///< Scalar general purpose register count
|
||||
uint32_t arch_vgpr_count; ///< Architecture vector general purpose register count
|
||||
uint32_t accum_vgpr_count; ///< Accum vector general purpose register count
|
||||
|
||||
} rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -37,6 +37,23 @@ ROCPROFILER_EXTERN_C_INIT
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Kernel dispatch data for profile counting callbacks
|
||||
*
|
||||
*/
|
||||
typedef struct rocprofiler_profile_counting_dispatch_data_t
|
||||
{
|
||||
uint64_t size; ///< Size of this struct
|
||||
rocprofiler_kernel_id_t kernel_id; ///< Kernel identifier
|
||||
rocprofiler_agent_id_t agent_id; ///< Agent ID where kernel is launched
|
||||
rocprofiler_queue_id_t queue_id; ///< Queue ID where kernel packet is enqueued
|
||||
rocprofiler_correlation_id_t correlation_id; ///< Correlation ID for this dispatch
|
||||
uint32_t private_segment_size; /// runtime private memory segment size
|
||||
uint32_t group_segment_size; /// runtime group memory segment size
|
||||
rocprofiler_dim3_t workgroup_size; /// runtime workgroup size (grid * threads)
|
||||
rocprofiler_dim3_t grid_size; /// runtime grid size
|
||||
} rocprofiler_profile_counting_dispatch_data_t;
|
||||
|
||||
/**
|
||||
* @brief Kernel Dispatch Callback. This is a callback that is invoked before the kernel
|
||||
* is enqueued into the HSA queue. What counters to collect for a kernel are set
|
||||
@@ -44,22 +61,35 @@ ROCPROFILER_EXTERN_C_INIT
|
||||
* will be collected and emplaced in the buffer with @ref rocprofiler_buffer_id_t used when
|
||||
* setting up this callback.
|
||||
*
|
||||
* @param [in] queue_id Queue the kernel dispatch packet is being enqueued onto
|
||||
* @param [in] agent Agent of this queue
|
||||
* @param [in] correlation_id Correlation ID for this dispatch
|
||||
* @param [in] dispatch_packet Kernel dispatch packet about to be enqueued into HSA
|
||||
* @param [in] kernel_id Kernel identifier
|
||||
* @param [in] dispatch_data @see ::rocprofiler_profile_counting_dispatch_data_t
|
||||
* @param [out] config Profile config detailing the counters to collect for this kernel
|
||||
* @param [out] user_data User data unique to this dispatch. Returned in record callback
|
||||
* @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service
|
||||
* @param [out] config Profile config detailing the counters to collect for this kernel
|
||||
*/
|
||||
typedef void (*rocprofiler_profile_counting_dispatch_callback_t)(
|
||||
rocprofiler_queue_id_t queue_id,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t correlation_id,
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet,
|
||||
uint64_t kernel_id,
|
||||
void* callback_data_args,
|
||||
rocprofiler_profile_config_id_t* config);
|
||||
rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
void* callback_data_args);
|
||||
|
||||
/**
|
||||
* @brief Counting record callback. This is a callback is invoked when the kernel
|
||||
* execution is complete and contains the counter profile data requested in
|
||||
* @ref rocprofiler_profile_counting_dispatch_callback_t. Only used with
|
||||
* @ref rocprofiler_configure_callback_dispatch_profile_counting_service.
|
||||
*
|
||||
* @param [in] dispatch_data @see ::rocprofiler_profile_counting_dispatch_data_t
|
||||
* @param [in] record_data Counter record data.
|
||||
* @param [in] record_count Number of counter records.
|
||||
* @param [in] user_data User data instance from dispatch callback
|
||||
* @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service
|
||||
*/
|
||||
typedef void (*rocprofiler_profile_counting_record_callback_t)(
|
||||
rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_record_counter_t* record_data,
|
||||
size_t record_count,
|
||||
rocprofiler_user_data_t user_data,
|
||||
void* callback_data_args);
|
||||
|
||||
/**
|
||||
* @brief Configure buffered dispatch profile Counting Service.
|
||||
@@ -95,29 +125,6 @@ rocprofiler_configure_buffered_dispatch_profile_counting_service(
|
||||
rocprofiler_profile_counting_dispatch_callback_t callback,
|
||||
void* callback_data_args);
|
||||
|
||||
/**
|
||||
* @brief Counting record callback. This is a callback is invoked when the kernel
|
||||
* execution is complete and contains the counter profile data requested in
|
||||
* @ref rocprofiler_profile_counting_dispatch_callback_t. Only used with
|
||||
* @ref rocprofiler_configure_callback_dispatch_profile_counting_service.
|
||||
*
|
||||
* @param [in] queue_id Queue the kernel dispatch packet is being enqueued onto
|
||||
* @param [in] agent Agent of this queue
|
||||
* @param [in] correlation_id Correlation ID for this dispatch
|
||||
* @param [in] kernel_id Kernel identifier
|
||||
* @param [in] callback_data_args Callback supplied via buffered_dispatch_profile_counting_service
|
||||
* @param [in] record_count Number of counter records.
|
||||
* @param [in] record_data Counter record data.
|
||||
*/
|
||||
typedef void (*rocprofiler_profile_counting_record_callback_t)(
|
||||
rocprofiler_queue_id_t queue_id,
|
||||
rocprofiler_agent_id_t agent,
|
||||
rocprofiler_correlation_id_t correlation_id,
|
||||
uint64_t kernel_id,
|
||||
void* callback_data_args,
|
||||
size_t record_count,
|
||||
rocprofiler_record_counter_t* record_data);
|
||||
|
||||
/**
|
||||
* @brief Configure buffered dispatch profile Counting Service.
|
||||
* Collects the counters in dispatch packets and calls a callback
|
||||
|
||||
@@ -261,26 +261,16 @@ typedef enum
|
||||
ROCPROFILER_TABLE_LAST = ROCPROFILER_MARKER_NAME_TABLE,
|
||||
} rocprofiler_intercept_table_t;
|
||||
|
||||
/**
|
||||
* @brief Enumeration for specifying the data type contained within the union.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
ROCPROFILER_UNION_TYPE_NONE = 0, ///< No union type
|
||||
ROCPROFILER_UNION_TYPE_STRING, ///< String Type set
|
||||
ROCPROFILER_UNION_TYPE_INT, ///< Integer Type Set
|
||||
ROCPROFILER_UNION_TYPE_LAST,
|
||||
} rocprofiler_union_type_t;
|
||||
|
||||
/**
|
||||
* @brief Enumeration for specifying the counter info struct version you want.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
ROCPROFILER_COUNTER_INFO_VERSION_NONE,
|
||||
ROCPROFILER_COUNTER_INFO_VERSION_0, ///< @see rocprofiler_counter_info_v0_t
|
||||
ROCPROFILER_COUNTER_INFO_VERSION_0, ///< @see ::rocprofiler_counter_info_v0_t
|
||||
ROCPROFILER_COUNTER_INFO_VERSION_LAST,
|
||||
} rocprofiler_counter_info_version_id_t;
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
//
|
||||
// ALIASES
|
||||
|
||||
@@ -112,9 +112,27 @@ get_env(std::string_view env_id, Tp _default, std::enable_if_t<std::is_integral<
|
||||
return _default;
|
||||
}
|
||||
|
||||
int
|
||||
set_env(std::string_view env_id, bool value, int override)
|
||||
{
|
||||
return ::setenv(env_id.data(), (value) ? "1" : "0", override);
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
int
|
||||
set_env(std::string_view env_id, Tp value, int override)
|
||||
{
|
||||
auto str_value = std::stringstream{};
|
||||
str_value << value;
|
||||
return ::setenv(env_id.data(), str_value.str().c_str(), override);
|
||||
}
|
||||
|
||||
#define SPECIALIZE_GET_ENV(TYPE) \
|
||||
template TYPE get_env<TYPE>( \
|
||||
std::string_view, TYPE, std::enable_if_t<std::is_integral<TYPE>::value, sfinae>);
|
||||
std::string_view, TYPE, std::enable_if_t<std::is_integral<TYPE>::value, sfinae>); \
|
||||
template int set_env<TYPE>(std::string_view, TYPE, int);
|
||||
|
||||
#define SPECIALIZE_SET_ENV(TYPE) template int set_env<TYPE>(std::string_view, TYPE, int);
|
||||
|
||||
SPECIALIZE_GET_ENV(int8_t)
|
||||
SPECIALIZE_GET_ENV(int16_t)
|
||||
@@ -124,6 +142,11 @@ SPECIALIZE_GET_ENV(uint8_t)
|
||||
SPECIALIZE_GET_ENV(uint16_t)
|
||||
SPECIALIZE_GET_ENV(uint32_t)
|
||||
SPECIALIZE_GET_ENV(uint64_t)
|
||||
|
||||
SPECIALIZE_SET_ENV(const char*)
|
||||
SPECIALIZE_SET_ENV(std::string)
|
||||
SPECIALIZE_SET_ENV(float)
|
||||
SPECIALIZE_SET_ENV(double)
|
||||
} // namespace impl
|
||||
} // namespace common
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -48,6 +48,13 @@ get_env(std::string_view, bool);
|
||||
|
||||
template <typename Tp>
|
||||
Tp get_env(std::string_view, Tp, std::enable_if_t<std::is_integral<Tp>::value, sfinae> = {});
|
||||
|
||||
int
|
||||
set_env(std::string_view, bool, int override = 0);
|
||||
|
||||
template <typename Tp>
|
||||
int
|
||||
set_env(std::string_view, Tp, int override = 0);
|
||||
} // namespace impl
|
||||
|
||||
template <typename Tp>
|
||||
@@ -66,6 +73,13 @@ get_env(std::string_view env_id, Tp&& _default)
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
inline auto
|
||||
set_env(std::string_view env_id, Tp&& value, int override = 0)
|
||||
{
|
||||
return impl::set_env(env_id, std::forward<Tp>(value), override);
|
||||
}
|
||||
|
||||
struct env_config
|
||||
{
|
||||
std::string env_name = {};
|
||||
|
||||
@@ -28,17 +28,28 @@
|
||||
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
namespace
|
||||
{
|
||||
void
|
||||
init_logging(std::string_view env_var)
|
||||
install_failure_signal_handler()
|
||||
{
|
||||
static auto _once = std::once_flag{};
|
||||
std::call_once(_once, [env_var]() {
|
||||
std::call_once(_once, []() { google::InstallFailureSignalHandler(); });
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
init_logging(std::string_view env_var, logging_config cfg)
|
||||
{
|
||||
static auto _once = std::once_flag{};
|
||||
std::call_once(_once, [env_var, &cfg]() {
|
||||
auto get_argv0 = []() {
|
||||
auto ifs = std::ifstream{"/proc/self/cmdline"};
|
||||
auto sarg = std::string{};
|
||||
@@ -50,18 +61,16 @@ init_logging(std::string_view env_var)
|
||||
return sarg;
|
||||
};
|
||||
|
||||
static auto argv0 = get_argv0();
|
||||
google::InitGoogleLogging(argv0.c_str());
|
||||
auto loglvl = common::get_env(env_var, "error");
|
||||
auto loglvl = common::get_env(env_var, "");
|
||||
for(auto& itr : loglvl)
|
||||
itr = tolower(itr);
|
||||
// default to warning
|
||||
auto loglvl_v = google::WARNING;
|
||||
if(loglvl.find_first_not_of("0123456789") == std::string::npos)
|
||||
auto& loglvl_v = cfg.loglevel;
|
||||
if(!loglvl.empty() && loglvl.find_first_not_of("0123456789") == std::string::npos)
|
||||
{
|
||||
loglvl_v = std::stoul(loglvl);
|
||||
}
|
||||
else
|
||||
else if(!loglvl.empty())
|
||||
{
|
||||
const auto opts =
|
||||
std::unordered_map<std::string_view, uint32_t>{{"info", google::INFO},
|
||||
@@ -77,10 +86,39 @@ init_logging(std::string_view env_var)
|
||||
loglvl_v = opts.at(loglvl);
|
||||
}
|
||||
|
||||
FLAGS_minloglevel = loglvl_v;
|
||||
FLAGS_stderrthreshold = loglvl_v;
|
||||
update_logging(cfg, true);
|
||||
|
||||
if(!google::IsGoogleLoggingInitialized())
|
||||
{
|
||||
static auto argv0 = get_argv0();
|
||||
google::InitGoogleLogging(argv0.c_str());
|
||||
}
|
||||
|
||||
update_logging(cfg);
|
||||
|
||||
LOG(INFO) << "logging initialized via " << env_var;
|
||||
});
|
||||
}
|
||||
|
||||
void
|
||||
update_logging(const logging_config& cfg, bool setup_env, int env_override)
|
||||
{
|
||||
static auto _mtx = std::mutex{};
|
||||
auto _lk = std::unique_lock<std::mutex>{_mtx};
|
||||
|
||||
FLAGS_timestamp_in_logfile_name = false;
|
||||
FLAGS_minloglevel = cfg.loglevel;
|
||||
FLAGS_stderrthreshold = cfg.loglevel;
|
||||
FLAGS_logtostderr = cfg.logtostderr;
|
||||
FLAGS_alsologtostderr = cfg.alsologtostderr;
|
||||
if(cfg.install_failure_handler) install_failure_signal_handler();
|
||||
|
||||
if(setup_env)
|
||||
{
|
||||
common::set_env("GOOGLE_LOG_DIR", get_env("PWD", ""), env_override);
|
||||
common::set_env("GOOGLE_LOGTOSTDERR", cfg.loglevel, env_override);
|
||||
common::set_env("GOOGLE_ALSOLOGTOSTDERR", cfg.alsologtostderr, env_override);
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -22,13 +22,27 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <string_view>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
struct logging_config
|
||||
{
|
||||
bool install_failure_handler = false;
|
||||
bool logtostderr = true;
|
||||
bool alsologtostderr = false;
|
||||
int32_t loglevel = google::WARNING;
|
||||
};
|
||||
|
||||
void
|
||||
init_logging(std::string_view env_var);
|
||||
}
|
||||
init_logging(std::string_view env_var, logging_config cfg = logging_config{});
|
||||
|
||||
void
|
||||
update_logging(const logging_config& cfg, bool setup_env = false, int env_override = 0);
|
||||
} // namespace common
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -12,7 +12,6 @@ add_subdirectory(plugins)
|
||||
target_link_libraries(
|
||||
rocprofiler-sdk-tool
|
||||
PRIVATE rocprofiler::rocprofiler-shared-library
|
||||
rocprofiler::rocprofiler-hsa-runtime
|
||||
rocprofiler::rocprofiler-headers
|
||||
rocprofiler::rocprofiler-build-flags
|
||||
rocprofiler::rocprofiler-memcheck
|
||||
|
||||
@@ -57,19 +57,22 @@ struct config
|
||||
{
|
||||
config();
|
||||
|
||||
bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true);
|
||||
bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false);
|
||||
bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false);
|
||||
bool hsa_api_trace = get_env("ROCPROF_HSA_API_TRACE", false);
|
||||
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
|
||||
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
|
||||
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
|
||||
bool hip_api_trace = get_env("ROCPROF_HIP_API_TRACE", false);
|
||||
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
|
||||
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
|
||||
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
|
||||
int mpi_size = get_mpi_size();
|
||||
int mpi_rank = get_mpi_rank();
|
||||
bool demangle = get_env("ROCPROF_DEMANGLE_KERNELS", true);
|
||||
bool truncate = get_env("ROCPROF_TRUNCATE_KERNELS", false);
|
||||
bool kernel_trace = get_env("ROCPROF_KERNEL_TRACE", false);
|
||||
bool hsa_core_api_trace = get_env("ROCPROF_HSA_CORE_API_TRACE", false);
|
||||
bool hsa_amd_ext_api_trace = get_env("ROCPROF_HSA_AMD_EXT_API_TRACE", false);
|
||||
bool hsa_image_ext_api_trace = get_env("ROCPROF_HSA_IMAGE_EXT_API_TRACE", false);
|
||||
bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false);
|
||||
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
|
||||
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
|
||||
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
|
||||
bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false);
|
||||
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
|
||||
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
|
||||
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
|
||||
int mpi_size = get_mpi_size();
|
||||
int mpi_rank = get_mpi_rank();
|
||||
std::string output_path = get_env("ROCPROF_OUTPUT_PATH", fs::current_path().string());
|
||||
std::string output_file = get_env("ROCPROF_OUTPUT_FILE_NAME", std::to_string(getpid()));
|
||||
std::vector<std::string> kernel_names = {};
|
||||
|
||||
@@ -22,7 +22,8 @@
|
||||
|
||||
#include "helper.hpp"
|
||||
#include "config.hpp"
|
||||
#include "rocprofiler-sdk/fwd.h"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
@@ -33,198 +34,6 @@
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
|
||||
namespace
|
||||
{
|
||||
using amd_compute_pgm_rsrc_three32_t = uint32_t;
|
||||
|
||||
// AMD Compute Program Resource Register Three.
|
||||
enum amd_compute_gfx9_pgm_rsrc_three_t
|
||||
{
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1)
|
||||
};
|
||||
|
||||
enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t
|
||||
{
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_SHARED_VGPR_COUNT, 0, 4),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_INST_PREF_SIZE, 4, 6),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_START, 10, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_END, 11, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_IMAGE_OP, 31, 1)
|
||||
};
|
||||
|
||||
// Kernel code properties.
|
||||
enum amd_kernel_code_property_t
|
||||
{
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
|
||||
0,
|
||||
1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
|
||||
3,
|
||||
1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, 4, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
|
||||
6,
|
||||
1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
|
||||
10,
|
||||
1), // GFX10+
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4),
|
||||
};
|
||||
|
||||
std::unordered_map<rocprofiler_address_t, const char*> kernel_descriptor_name_map;
|
||||
|
||||
std::mutex kernel_properties_correlation_mutex;
|
||||
std::unordered_map<uint64_t, rocprofiler_tool_kernel_properties_t>
|
||||
kernel_properties_correlation_map;
|
||||
|
||||
uint32_t
|
||||
arch_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code)
|
||||
{
|
||||
std::string info_name(name.data(), name.size());
|
||||
if(strcmp(name.data(), "gfx90a") == 0 || strncmp(name.data(), "gfx94", 5) == 0)
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3,
|
||||
AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) +
|
||||
1) *
|
||||
4;
|
||||
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
|
||||
1) *
|
||||
(AMD_HSA_BITS_GET(kernel_code.kernel_code_properties,
|
||||
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
|
||||
? 8
|
||||
: 4);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
accum_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code)
|
||||
{
|
||||
std::string info_name(name.data(), name.size());
|
||||
if(strcmp(info_name.c_str(), "gfx908") == 0) return arch_vgpr_count(name, kernel_code);
|
||||
if(strcmp(info_name.c_str(), "gfx90a") == 0 || strncmp(info_name.c_str(), "gfx94", 5) == 0)
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
|
||||
1) *
|
||||
8 -
|
||||
arch_vgpr_count(name, kernel_code);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
sgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code)
|
||||
{
|
||||
// GFX10 and later always allocate 128 sgprs.
|
||||
|
||||
// TODO(srnagara): Recheck the extraction of gfxip from gpu name
|
||||
|
||||
const char* name_data = name.data();
|
||||
const size_t gfxip_label_len = std::min(name.size() - 2, size_t{63});
|
||||
if(gfxip_label_len > 0 && strnlen(name_data, gfxip_label_len + 1) >= gfxip_label_len)
|
||||
{
|
||||
auto gfxip = std::vector<char>{};
|
||||
gfxip.resize(gfxip_label_len + 1, '\0');
|
||||
memcpy(gfxip.data(), name_data, gfxip_label_len);
|
||||
// TODO(srnagara): Check if it is hardcoded
|
||||
if(std::stoi(&gfxip.at(3)) >= 10) return 128;
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) /
|
||||
2 +
|
||||
1) *
|
||||
16;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
const auto&
|
||||
GetLoaderTable()
|
||||
{
|
||||
static const auto _v = []() {
|
||||
using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t;
|
||||
auto _tbl = hsa_loader_table_t{};
|
||||
memset(&_tbl, 0, sizeof(hsa_loader_table_t));
|
||||
hsa_system_get_major_extension_table(
|
||||
HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_loader_table_t), &_tbl);
|
||||
return _tbl;
|
||||
}();
|
||||
return _v;
|
||||
}
|
||||
|
||||
const kernel_descriptor_t*
|
||||
GetKernelCode(uint64_t kernel_object)
|
||||
{
|
||||
const kernel_descriptor_t* kernel_code = nullptr;
|
||||
if(GetLoaderTable().hsa_ven_amd_loader_query_host_address == nullptr) return kernel_code;
|
||||
hsa_status_t status = GetLoaderTable().hsa_ven_amd_loader_query_host_address(
|
||||
reinterpret_cast<const void*>(kernel_object), // NOLINT(performance-no-int-to-ptr)
|
||||
reinterpret_cast<const void**>(&kernel_code));
|
||||
if(HSA_STATUS_SUCCESS != status)
|
||||
{
|
||||
kernel_code = reinterpret_cast<kernel_descriptor_t*>( // NOLINT(performance-no-int-to-ptr)
|
||||
kernel_object);
|
||||
}
|
||||
return kernel_code;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
SetKernelProperties(uint64_t correlation_id, rocprofiler_tool_kernel_properties_t kernel_properties)
|
||||
{
|
||||
std::lock_guard<std::mutex> kernel_properties_correlation_map_lock(
|
||||
kernel_properties_correlation_mutex);
|
||||
kernel_properties_correlation_map[correlation_id] = std::move(kernel_properties);
|
||||
}
|
||||
|
||||
rocprofiler_tool_kernel_properties_t
|
||||
GetKernelProperties(uint64_t correlation_id)
|
||||
{
|
||||
std::lock_guard<std::mutex> kernel_properties_correlation_map_lock(
|
||||
kernel_properties_correlation_mutex);
|
||||
auto it = kernel_properties_correlation_map.find(correlation_id);
|
||||
if(it == kernel_properties_correlation_map.end())
|
||||
{
|
||||
std::cout << "kernel properties not found" << std::endl;
|
||||
abort();
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void
|
||||
populate_kernel_properties_data(rocprofiler_tool_kernel_properties_t* kernel_properties,
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet)
|
||||
{
|
||||
const uint64_t kernel_object = dispatch_packet->kernel_object;
|
||||
|
||||
const kernel_descriptor_t* kernel_code = GetKernelCode(kernel_object);
|
||||
uint64_t grid_size =
|
||||
dispatch_packet->grid_size_x * dispatch_packet->grid_size_y * dispatch_packet->grid_size_z;
|
||||
if(grid_size > UINT32_MAX) abort();
|
||||
kernel_properties->grid_size = grid_size;
|
||||
uint64_t workgroup_size = dispatch_packet->workgroup_size_x *
|
||||
dispatch_packet->workgroup_size_y * dispatch_packet->workgroup_size_z;
|
||||
if(workgroup_size > UINT32_MAX) abort();
|
||||
kernel_properties->workgroup_size = (uint32_t) workgroup_size;
|
||||
kernel_properties->lds_size = dispatch_packet->group_segment_size;
|
||||
kernel_properties->scratch_size = dispatch_packet->private_segment_size;
|
||||
kernel_properties->arch_vgpr_count =
|
||||
arch_vgpr_count(kernel_properties->gpu_agent.name, *kernel_code);
|
||||
kernel_properties->accum_vgpr_count =
|
||||
accum_vgpr_count(kernel_properties->gpu_agent.name, *kernel_code);
|
||||
kernel_properties->sgpr_count = sgpr_count(kernel_properties->gpu_agent.name, *kernel_code);
|
||||
kernel_properties->wave_size =
|
||||
AMD_HSA_BITS_GET(kernel_code->kernel_code_properties,
|
||||
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
|
||||
? 32
|
||||
: 64;
|
||||
kernel_properties->signal_handle = dispatch_packet->completion_signal.handle;
|
||||
}
|
||||
|
||||
rocprofiler_tool_buffer_name_info_t
|
||||
get_buffer_id_names()
|
||||
{
|
||||
|
||||
@@ -71,46 +71,6 @@
|
||||
constexpr size_t BUFFER_SIZE_BYTES = 4096;
|
||||
constexpr size_t WATERMARK = (BUFFER_SIZE_BYTES / 2);
|
||||
|
||||
// This can be different for different architecture
|
||||
// Lets follow the v1 rocprof
|
||||
// I will have a kernel id from the rocprofiler
|
||||
// address the kernel descriptor and access the information
|
||||
// This works for gfx9 but may not for Navi arch
|
||||
// Interecept the kernel symbol load build a table for kernel id
|
||||
// when kenel dispatch callback. Here is the kernel id
|
||||
// Use the kernel id
|
||||
typedef struct
|
||||
{
|
||||
uint64_t grid_size;
|
||||
uint64_t workgroup_size;
|
||||
uint64_t lds_size;
|
||||
uint64_t scratch_size;
|
||||
uint64_t arch_vgpr_count;
|
||||
uint64_t accum_vgpr_count;
|
||||
uint64_t sgpr_count;
|
||||
uint64_t wave_size;
|
||||
uint64_t signal_handle;
|
||||
uint64_t kernel_object;
|
||||
rocprofiler_queue_id_t queue_id;
|
||||
std::string kernel_name;
|
||||
rocprofiler_agent_t gpu_agent;
|
||||
uint64_t thread_id;
|
||||
uint64_t dispatch_index;
|
||||
|
||||
} rocprofiler_tool_kernel_properties_t;
|
||||
|
||||
struct kernel_descriptor_t
|
||||
{
|
||||
uint8_t reserved0[16];
|
||||
int64_t kernel_code_entry_byte_offset;
|
||||
uint8_t reserved1[20];
|
||||
uint32_t compute_pgm_rsrc3;
|
||||
uint32_t compute_pgm_rsrc1;
|
||||
uint32_t compute_pgm_rsrc2;
|
||||
uint16_t kernel_code_properties;
|
||||
uint8_t reserved2[6];
|
||||
};
|
||||
|
||||
using rocprofiler_tool_buffer_kind_names_t =
|
||||
std::unordered_map<rocprofiler_buffer_tracing_kind_t, const char*>;
|
||||
using rocprofiler_tool_buffer_kind_operation_names_t =
|
||||
@@ -135,29 +95,6 @@ struct rocprofiler_tool_callback_name_info_t
|
||||
rocprofiler_tool_callback_kind_operation_names_t operation_names = {};
|
||||
};
|
||||
|
||||
// std::vector<std::string>
|
||||
// GetCounterNames();
|
||||
|
||||
void
|
||||
SetKernelDescriptorName(rocprofiler_address_t kernel_descriptor, const char* name);
|
||||
|
||||
void
|
||||
SetKernelProperties(uint64_t correlation_id,
|
||||
rocprofiler_tool_kernel_properties_t kernel_properties);
|
||||
void
|
||||
SetKernelProperties(uint64_t correlation_id,
|
||||
rocprofiler_tool_kernel_properties_t kernel_properties);
|
||||
|
||||
rocprofiler_tool_kernel_properties_t
|
||||
GetKernelProperties(uint64_t correlation_id);
|
||||
|
||||
const char*
|
||||
GetKernelDescriptorName(rocprofiler_address_t kernel_descriptor);
|
||||
|
||||
void
|
||||
populate_kernel_properties_data(rocprofiler_tool_kernel_properties_t* kernel_properties,
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet);
|
||||
|
||||
rocprofiler_tool_buffer_name_info_t
|
||||
get_buffer_id_names();
|
||||
|
||||
|
||||
@@ -33,6 +33,8 @@
|
||||
#include "lib/common/utility.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/external_correlation.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/internal_threading.h>
|
||||
#include <rocprofiler-sdk/marker/api_id.h>
|
||||
@@ -55,6 +57,21 @@
|
||||
namespace common = ::rocprofiler::common;
|
||||
namespace tool = ::rocprofiler::tool;
|
||||
|
||||
namespace std
|
||||
{
|
||||
template <>
|
||||
struct hash<rocprofiler_agent_id_t>
|
||||
{
|
||||
size_t operator()(rocprofiler_agent_id_t id) const { return id.handle; }
|
||||
};
|
||||
} // namespace std
|
||||
|
||||
inline bool
|
||||
operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs)
|
||||
{
|
||||
return (lhs.handle == rhs.handle);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
constexpr uint32_t lds_block_size = 128 * 4;
|
||||
@@ -68,16 +85,23 @@ get_dereference(Tp* ptr)
|
||||
return *CHECK_NOTNULL(ptr);
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
void
|
||||
add_destructor(Tp*& ptr)
|
||||
auto
|
||||
get_destructors_lock()
|
||||
{
|
||||
static auto _mutex = std::mutex{};
|
||||
auto _lk = std::unique_lock<std::mutex>{_mutex};
|
||||
return std::unique_lock<std::mutex>{_mutex};
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
Tp*&
|
||||
add_destructor(Tp*& ptr)
|
||||
{
|
||||
auto _lk = get_destructors_lock();
|
||||
destructors->emplace_back([&ptr]() {
|
||||
delete ptr;
|
||||
ptr = nullptr;
|
||||
});
|
||||
return ptr;
|
||||
}
|
||||
|
||||
#define ADD_DESTRUCTOR(PTR) \
|
||||
@@ -155,7 +179,7 @@ get_counter_collection_file()
|
||||
"Process_Id",
|
||||
"Thread_Id",
|
||||
"Grid_Size",
|
||||
"Kernel-Name",
|
||||
"Kernel_Name",
|
||||
"Workgroup_Size",
|
||||
"LDS_Block_Size",
|
||||
"Scratch_Size",
|
||||
@@ -255,6 +279,7 @@ get_buffers()
|
||||
return _v;
|
||||
}
|
||||
|
||||
using rocprofiler_code_object_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
|
||||
using rocprofiler_kernel_symbol_data_t =
|
||||
rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
|
||||
@@ -274,14 +299,46 @@ struct kernel_symbol_data : rocprofiler_kernel_symbol_data_t
|
||||
std::string truncated_kernel_name = {};
|
||||
};
|
||||
|
||||
template <typename Tp>
|
||||
Tp*
|
||||
as_pointer(Tp&& _val)
|
||||
{
|
||||
return new Tp{std::forward<Tp>(_val)};
|
||||
}
|
||||
|
||||
using code_object_data_map_t = std::unordered_map<uint64_t, rocprofiler_code_object_data_t>;
|
||||
using kernel_symbol_data_map_t = std::unordered_map<rocprofiler_kernel_id_t, kernel_symbol_data>;
|
||||
auto kernel_data = common::Synchronized<kernel_symbol_data_map_t, true>{};
|
||||
using targeted_kernels_set_t = std::unordered_set<rocprofiler_kernel_id_t>;
|
||||
using counter_dimension_info_map_t =
|
||||
std::unordered_map<uint64_t, std::vector<rocprofiler_record_dimension_info_t>>;
|
||||
std::atomic<uint64_t> dispatch_index{0};
|
||||
auto counter_dimension_data = common::Synchronized<counter_dimension_info_map_t, true>{};
|
||||
auto buffered_name_info = get_buffer_id_names();
|
||||
auto callback_name_info = get_callback_id_names();
|
||||
|
||||
auto code_obj_data = common::Synchronized<code_object_data_map_t, true>{};
|
||||
auto kernel_data = common::Synchronized<kernel_symbol_data_map_t, true>{};
|
||||
auto counter_dimension_data = common::Synchronized<counter_dimension_info_map_t, true>{};
|
||||
auto target_kernels = common::Synchronized<targeted_kernels_set_t>{};
|
||||
auto dispatch_index = std::atomic<uint64_t>{0};
|
||||
auto* buffered_name_info = as_pointer(get_buffer_id_names());
|
||||
auto* callback_name_info = as_pointer(get_callback_id_names());
|
||||
|
||||
bool
|
||||
add_kernel_target(uint64_t _kern_id)
|
||||
{
|
||||
return target_kernels
|
||||
.wlock([](targeted_kernels_set_t& _targets_v,
|
||||
uint64_t _kern_id_v) { return _targets_v.emplace(_kern_id_v); },
|
||||
_kern_id)
|
||||
.second;
|
||||
}
|
||||
|
||||
bool
|
||||
is_targeted_kernel(uint64_t _kern_id)
|
||||
{
|
||||
return target_kernels.rlock(
|
||||
[](const targeted_kernels_set_t& _targets_v, uint64_t _kern_id_v) {
|
||||
return (_targets_v.count(_kern_id_v) > 0);
|
||||
},
|
||||
_kern_id);
|
||||
}
|
||||
|
||||
auto&
|
||||
get_client_ctx()
|
||||
@@ -328,15 +385,16 @@ cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
auto ts = rocprofiler_timestamp_t{};
|
||||
rocprofiler_get_timestamp(&ts);
|
||||
|
||||
const auto* kind_name = callback_name_info.kind_names.at(record.kind);
|
||||
const auto* kind_name = CHECK_NOTNULL(callback_name_info)->kind_names.at(record.kind);
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
|
||||
{
|
||||
user_data->value = ts;
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto* op_name =
|
||||
callback_name_info.operation_names.at(record.kind).at(record.operation);
|
||||
const auto* op_name = CHECK_NOTNULL(callback_name_info)
|
||||
->operation_names.at(record.kind)
|
||||
.at(record.operation);
|
||||
auto ss = std::stringstream{};
|
||||
tool::csv::marker_csv_encoder::write_row(ss,
|
||||
kind_name,
|
||||
@@ -368,7 +426,7 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
auto ts = rocprofiler_timestamp_t{};
|
||||
rocprofiler_get_timestamp(&ts);
|
||||
|
||||
const auto* kind_name = callback_name_info.kind_names.at(record.kind);
|
||||
const auto* kind_name = CHECK_NOTNULL(callback_name_info)->kind_names.at(record.kind);
|
||||
if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA)
|
||||
{
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT)
|
||||
@@ -460,8 +518,9 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto* op_name =
|
||||
callback_name_info.operation_names.at(record.kind).at(record.operation);
|
||||
const auto* op_name = CHECK_NOTNULL(callback_name_info)
|
||||
->operation_names.at(record.kind)
|
||||
.at(record.operation);
|
||||
auto ss = std::stringstream{};
|
||||
tool::csv::marker_csv_encoder::write_row(ss,
|
||||
kind_name,
|
||||
@@ -481,78 +540,6 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
(void) data;
|
||||
}
|
||||
|
||||
void
|
||||
counter_record_callback(rocprofiler_queue_id_t,
|
||||
const rocprofiler_agent_id_t,
|
||||
rocprofiler_correlation_id_t correlation_id,
|
||||
uint64_t,
|
||||
void*,
|
||||
size_t record_count,
|
||||
rocprofiler_record_counter_t* record_data)
|
||||
{
|
||||
rocprofiler_tool_kernel_properties_t kernel_properties =
|
||||
GetKernelProperties(correlation_id.internal);
|
||||
std::map<const char*, uint64_t> counter_name_value;
|
||||
for(size_t count = 0; count < record_count; count++)
|
||||
{
|
||||
auto profiler_record = static_cast<rocprofiler_record_counter_t>(record_data[count]);
|
||||
rocprofiler_counter_id_t counter_id;
|
||||
rocprofiler_query_record_counter_id(profiler_record.id, &counter_id);
|
||||
rocprofiler_counter_info_v0_t version;
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_query_counter_info(
|
||||
counter_id, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&version)),
|
||||
"Could not query counter_id");
|
||||
const auto& dimension_pos_ss = counter_dimension_data.rlock(
|
||||
[&profiler_record](const counter_dimension_info_map_t& counter_dimension_data_v,
|
||||
uint64_t handle) {
|
||||
auto dimensions = counter_dimension_data_v.at(handle);
|
||||
size_t pos;
|
||||
auto pos_ss = std::stringstream{};
|
||||
size_t num_dim = dimensions.size();
|
||||
for(size_t idx = 0; idx != num_dim; idx++)
|
||||
{
|
||||
rocprofiler_query_record_dimension_position(
|
||||
profiler_record.id, dimensions[idx].id, &pos);
|
||||
pos_ss << dimensions[idx].name << ":" << pos;
|
||||
if(idx != num_dim - 1) pos_ss << ",";
|
||||
}
|
||||
return pos_ss;
|
||||
},
|
||||
counter_id.handle);
|
||||
auto search = counter_name_value.find(version.name);
|
||||
if(search == counter_name_value.end())
|
||||
counter_name_value.emplace(
|
||||
std::pair<const char*, uint64_t>{version.name, profiler_record.counter_value});
|
||||
else
|
||||
search->second = search->second + profiler_record.counter_value;
|
||||
}
|
||||
|
||||
for(auto itr = counter_name_value.begin(); itr != counter_name_value.end(); ++itr)
|
||||
{
|
||||
auto counter_collection_ss = std::stringstream{};
|
||||
tool::csv::counter_collection_csv_encoder::write_row(
|
||||
counter_collection_ss,
|
||||
correlation_id.internal,
|
||||
kernel_properties.dispatch_index,
|
||||
kernel_properties.gpu_agent.id.handle,
|
||||
kernel_properties.queue_id.handle,
|
||||
getpid(),
|
||||
kernel_properties.thread_id,
|
||||
kernel_properties.grid_size,
|
||||
kernel_properties.kernel_name,
|
||||
kernel_properties.workgroup_size,
|
||||
((kernel_properties.lds_size + (lds_block_size - 1)) & ~(lds_block_size - 1)),
|
||||
kernel_properties.scratch_size,
|
||||
kernel_properties.arch_vgpr_count,
|
||||
kernel_properties.sgpr_count,
|
||||
itr->first,
|
||||
itr->second);
|
||||
|
||||
get_dereference(get_counter_collection_file()) << counter_collection_ss.str();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
@@ -561,7 +548,19 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
|
||||
record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD)
|
||||
{
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
|
||||
{
|
||||
auto* obj_data = static_cast<rocprofiler_code_object_data_t*>(record.payload);
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
|
||||
{
|
||||
code_obj_data.wlock(
|
||||
[](code_object_data_map_t& cdata, rocprofiler_code_object_data_t* obj_data_v) {
|
||||
cdata.emplace(obj_data_v->code_object_id, *obj_data_v);
|
||||
},
|
||||
CHECK_NOTNULL(obj_data));
|
||||
}
|
||||
}
|
||||
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
|
||||
{
|
||||
flush();
|
||||
}
|
||||
@@ -573,11 +572,50 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
auto* sym_data = static_cast<rocprofiler_kernel_symbol_data_t*>(record.payload);
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
|
||||
{
|
||||
kernel_data.wlock(
|
||||
auto itr = kernel_data.wlock(
|
||||
[](kernel_symbol_data_map_t& kdata, rocprofiler_kernel_symbol_data_t* sym_data_v) {
|
||||
kdata.emplace(sym_data_v->kernel_id, kernel_symbol_data{*sym_data_v});
|
||||
return kdata.emplace(sym_data_v->kernel_id, kernel_symbol_data{*sym_data_v});
|
||||
},
|
||||
sym_data);
|
||||
CHECK_NOTNULL(sym_data));
|
||||
|
||||
LOG_IF(WARNING, !itr.second)
|
||||
<< "duplicate kernel symbol data for kernel_id=" << sym_data->kernel_id;
|
||||
|
||||
// add the kernel to the kernel_targets if
|
||||
if(itr.second)
|
||||
{
|
||||
// if kernel name is provided by user then by default all kernels in the application
|
||||
// are targeted
|
||||
if(tool::get_config().kernel_names.empty())
|
||||
{
|
||||
add_kernel_target(sym_data->kernel_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto& kernel_info = itr.first->second;
|
||||
for(const auto& name : tool::get_config().kernel_names)
|
||||
{
|
||||
if(name == kernel_info.truncated_kernel_name)
|
||||
{
|
||||
add_kernel_target(itr.first->first);
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto dkernel_name = std::string_view{kernel_info.demangled_kernel_name};
|
||||
auto pos = dkernel_name.find(name);
|
||||
// if the demangled kernel name contains name and the next character is
|
||||
// '(' then mark as found
|
||||
if(pos != std::string::npos && (pos + 1) < dkernel_name.size() &&
|
||||
dkernel_name.at(pos + 1) == '(')
|
||||
{
|
||||
add_kernel_target(itr.first->first);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -622,7 +660,7 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
auto kernel_trace_ss = std::stringstream{};
|
||||
tool::csv::kernel_trace_csv_encoder::write_row(
|
||||
kernel_trace_ss,
|
||||
buffered_name_info.kind_names.at(record->kind),
|
||||
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
|
||||
record->agent_id.handle,
|
||||
record->queue_id.handle,
|
||||
record->kernel_id,
|
||||
@@ -652,8 +690,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
auto hsa_trace_ss = std::stringstream{};
|
||||
tool::csv::api_csv_encoder::write_row(
|
||||
hsa_trace_ss,
|
||||
buffered_name_info.kind_names.at(record->kind),
|
||||
buffered_name_info.operation_names.at(record->kind).at(record->operation),
|
||||
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
|
||||
CHECK_NOTNULL(buffered_name_info)
|
||||
->operation_names.at(record->kind)
|
||||
.at(record->operation),
|
||||
getpid(),
|
||||
record->thread_id,
|
||||
record->correlation_id.internal,
|
||||
@@ -670,8 +710,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
auto memory_copy_trace_ss = std::stringstream{};
|
||||
tool::csv::memory_copy_csv_encoder::write_row(
|
||||
memory_copy_trace_ss,
|
||||
buffered_name_info.kind_names.at(record->kind),
|
||||
buffered_name_info.operation_names.at(record->kind).at(record->operation),
|
||||
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
|
||||
CHECK_NOTNULL(buffered_name_info)
|
||||
->operation_names.at(record->kind)
|
||||
.at(record->operation),
|
||||
record->src_agent_id.handle,
|
||||
record->dst_agent_id.handle,
|
||||
record->correlation_id.internal,
|
||||
@@ -689,8 +731,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
auto hip_trace_ss = std::stringstream{};
|
||||
tool::csv::api_csv_encoder::write_row(
|
||||
hip_trace_ss,
|
||||
buffered_name_info.kind_names.at(record->kind),
|
||||
buffered_name_info.operation_names.at(record->kind).at(record->operation),
|
||||
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
|
||||
CHECK_NOTNULL(buffered_name_info)
|
||||
->operation_names.at(record->kind)
|
||||
.at(record->operation),
|
||||
getpid(),
|
||||
record->thread_id,
|
||||
record->correlation_id.internal,
|
||||
@@ -710,7 +754,7 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
|
||||
using counter_vec_t = std::vector<rocprofiler_counter_id_t>;
|
||||
using agent_counter_map_t =
|
||||
std::unordered_map<const rocprofiler_agent_t*, std::optional<rocprofiler_profile_config_id_t>>;
|
||||
std::unordered_map<rocprofiler_agent_id_t, std::optional<rocprofiler_profile_config_id_t>>;
|
||||
|
||||
rocprofiler_status_t
|
||||
dimensions_info_callback(rocprofiler_counter_id_t id,
|
||||
@@ -740,14 +784,14 @@ dimensions_info_callback(rocprofiler_counter_id_t id,
|
||||
|
||||
// this function creates a rocprofiler profile config on the first entry
|
||||
auto
|
||||
get_agent_profile(const rocprofiler_agent_t* agent)
|
||||
get_agent_profile(rocprofiler_agent_id_t agent_id)
|
||||
{
|
||||
static auto data = common::Synchronized<agent_counter_map_t>{};
|
||||
|
||||
auto profile = std::optional<rocprofiler_profile_config_id_t>{};
|
||||
data.ulock(
|
||||
[agent, &profile](const agent_counter_map_t& data_v) {
|
||||
auto itr = data_v.find(agent);
|
||||
[agent_id, &profile](const agent_counter_map_t& data_v) {
|
||||
auto itr = data_v.find(agent_id);
|
||||
if(itr != data_v.end())
|
||||
{
|
||||
profile = itr->second;
|
||||
@@ -755,11 +799,11 @@ get_agent_profile(const rocprofiler_agent_t* agent)
|
||||
}
|
||||
return false;
|
||||
},
|
||||
[agent, &profile](agent_counter_map_t& data_v) {
|
||||
[agent_id, &profile](agent_counter_map_t& data_v) {
|
||||
auto counters_v = counter_vec_t{};
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_iterate_agent_supported_counters(
|
||||
agent->id,
|
||||
agent_id,
|
||||
[](rocprofiler_agent_id_t,
|
||||
rocprofiler_counter_id_t* counters,
|
||||
size_t num_counters,
|
||||
@@ -771,15 +815,15 @@ get_agent_profile(const rocprofiler_agent_t* agent)
|
||||
counters[i], dimensions_info_callback, nullptr),
|
||||
"iterate_dimension_info");
|
||||
|
||||
rocprofiler_counter_info_v0_t version;
|
||||
rocprofiler_counter_info_v0_t info;
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_query_counter_info(counters[i],
|
||||
ROCPROFILER_COUNTER_INFO_VERSION_0,
|
||||
static_cast<void*>(&version)),
|
||||
static_cast<void*>(&info)),
|
||||
"Could not query counter_id");
|
||||
|
||||
if(tool::get_config().counters.count(version.name) > 0)
|
||||
if(tool::get_config().counters.count(info.name) > 0)
|
||||
vec->emplace_back(counters[i]);
|
||||
}
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
@@ -791,18 +835,122 @@ get_agent_profile(const rocprofiler_agent_t* agent)
|
||||
{
|
||||
auto profile_v = rocprofiler_profile_config_id_t{};
|
||||
ROCPROFILER_CALL(rocprofiler_create_profile_config(
|
||||
agent->id, counters_v.data(), counters_v.size(), &profile_v),
|
||||
agent_id, counters_v.data(), counters_v.size(), &profile_v),
|
||||
"Could not construct profile cfg");
|
||||
profile = profile_v;
|
||||
}
|
||||
|
||||
data_v.emplace(agent, profile);
|
||||
data_v.emplace(agent_id, profile);
|
||||
return true;
|
||||
});
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
struct counter_dispatch_data
|
||||
{
|
||||
uint64_t thread_id = 0;
|
||||
uint64_t dispatch_index = 0;
|
||||
};
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
void* /*callback_data_args*/)
|
||||
{
|
||||
auto kernel_id = dispatch_data.kernel_id;
|
||||
auto agent_id = dispatch_data.agent_id;
|
||||
|
||||
if(!is_targeted_kernel(kernel_id))
|
||||
{
|
||||
return;
|
||||
}
|
||||
else if(auto profile = get_agent_profile(agent_id))
|
||||
{
|
||||
*config = *profile;
|
||||
user_data->ptr = new counter_dispatch_data{.thread_id = common::get_tid(),
|
||||
.dispatch_index = ++dispatch_index};
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
counter_record_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_record_counter_t* record_data,
|
||||
size_t record_count,
|
||||
rocprofiler_user_data_t user_data,
|
||||
void* /*callback_data_args*/)
|
||||
{
|
||||
auto kernel_id = dispatch_data.kernel_id;
|
||||
const auto* cnt_dispatch_data_v = static_cast<counter_dispatch_data*>(user_data.ptr);
|
||||
const auto* kernel_info = kernel_data.rlock(
|
||||
[](const kernel_symbol_data_map_t& kdata, uint64_t kid) -> const auto* {
|
||||
return &kdata.at(kid);
|
||||
},
|
||||
kernel_id);
|
||||
|
||||
LOG_IF(FATAL, !kernel_info) << "missing kernel information for kernel_id=" << kernel_id;
|
||||
|
||||
LOG_IF(ERROR, record_count == 0) << "zero record count for kernel_id=" << kernel_id
|
||||
<< " (name=" << kernel_info->kernel_name << ")";
|
||||
|
||||
auto counter_name_value = std::map<const char*, uint64_t>{};
|
||||
for(size_t count = 0; count < record_count; count++)
|
||||
{
|
||||
auto profiler_record = static_cast<rocprofiler_record_counter_t>(record_data[count]);
|
||||
auto counter_id = rocprofiler_counter_id_t{};
|
||||
auto info = rocprofiler_counter_info_v0_t{};
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_query_record_counter_id(profiler_record.id, &counter_id),
|
||||
"query record counter id");
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_query_counter_info(
|
||||
counter_id, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>(&info)),
|
||||
"query counter info");
|
||||
|
||||
auto search = counter_name_value.find(info.name);
|
||||
if(search == counter_name_value.end())
|
||||
counter_name_value.emplace(
|
||||
std::pair<const char*, uint64_t>{info.name, profiler_record.counter_value});
|
||||
else
|
||||
search->second = search->second + profiler_record.counter_value;
|
||||
}
|
||||
|
||||
auto lds_block_size_v =
|
||||
(kernel_info->group_segment_size + (lds_block_size - 1)) & ~(lds_block_size - 1);
|
||||
|
||||
const auto& correlation_id = dispatch_data.correlation_id;
|
||||
|
||||
auto magnitude = [](rocprofiler_dim3_t dims) { return (dims.x * dims.y * dims.z); };
|
||||
|
||||
for(auto& itr : counter_name_value)
|
||||
{
|
||||
using csv_encoder = tool::csv::counter_collection_csv_encoder;
|
||||
|
||||
auto counter_collection_ss = std::stringstream{};
|
||||
csv_encoder::write_row(counter_collection_ss,
|
||||
correlation_id.internal,
|
||||
cnt_dispatch_data_v->dispatch_index,
|
||||
dispatch_data.agent_id.handle,
|
||||
dispatch_data.queue_id.handle,
|
||||
getpid(),
|
||||
cnt_dispatch_data_v->thread_id,
|
||||
magnitude(dispatch_data.grid_size),
|
||||
kernel_info->formatted_kernel_name,
|
||||
magnitude(dispatch_data.workgroup_size),
|
||||
lds_block_size_v,
|
||||
kernel_info->private_segment_size,
|
||||
kernel_info->arch_vgpr_count,
|
||||
kernel_info->sgpr_count,
|
||||
itr.first,
|
||||
itr.second);
|
||||
|
||||
get_dereference(get_counter_collection_file()) << counter_collection_ss.str();
|
||||
}
|
||||
|
||||
delete cnt_dispatch_data_v;
|
||||
}
|
||||
|
||||
rocprofiler_status_t
|
||||
list_metrics_iterate_agents(rocprofiler_agent_version_t,
|
||||
const void** agents,
|
||||
@@ -912,61 +1060,6 @@ list_metrics_iterate_agents(rocprofiler_agent_version_t,
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_queue_id_t queue_id,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t correlation_id,
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet,
|
||||
uint64_t kernel_id,
|
||||
void* /*callback_data_args*/,
|
||||
rocprofiler_profile_config_id_t* config)
|
||||
{
|
||||
rocprofiler_tool_kernel_properties_t kernel_properties;
|
||||
const auto& kernel_info =
|
||||
kernel_data.rlock([](const kernel_symbol_data_map_t& kdata,
|
||||
uint64_t kernel_id_v) { return kdata.at(kernel_id_v); },
|
||||
kernel_id);
|
||||
auto is_targeted_kernel = [&kernel_info]() {
|
||||
// if kernel name is provided by user then by default all kernels in the application are
|
||||
// targeted
|
||||
if(tool::get_config().kernel_names.empty()) return true;
|
||||
|
||||
for(const auto& name : tool::get_config().kernel_names)
|
||||
{
|
||||
if(name == kernel_info.truncated_kernel_name)
|
||||
return true;
|
||||
else
|
||||
{
|
||||
auto dkernel_name = std::string_view{kernel_info.demangled_kernel_name};
|
||||
auto pos = dkernel_name.find(name);
|
||||
// if the demangled kernel name contains name and the next character is '(' then
|
||||
// mark as found
|
||||
if(pos != std::string::npos && (pos + 1) < dkernel_name.size() &&
|
||||
dkernel_name.at(pos + 1) == '(')
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
if(!is_targeted_kernel()) return;
|
||||
|
||||
auto profile = get_agent_profile(agent);
|
||||
|
||||
if(profile)
|
||||
{
|
||||
kernel_properties.kernel_name = kernel_info.formatted_kernel_name;
|
||||
kernel_properties.dispatch_index = ++dispatch_index;
|
||||
kernel_properties.queue_id = queue_id;
|
||||
kernel_properties.gpu_agent = *agent;
|
||||
kernel_properties.thread_id = common::get_tid();
|
||||
populate_kernel_properties_data(&kernel_properties, dispatch_packet);
|
||||
SetKernelProperties(correlation_id.internal, kernel_properties);
|
||||
|
||||
*config = *profile;
|
||||
}
|
||||
}
|
||||
|
||||
rocprofiler_client_finalize_t client_finalizer = nullptr;
|
||||
rocprofiler_client_id_t* client_identifier = nullptr;
|
||||
|
||||
@@ -1059,7 +1152,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
"buffer tracing service for memory copy configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().hsa_api_trace)
|
||||
if(tool::get_config().hsa_core_api_trace || tool::get_config().hsa_amd_ext_api_trace ||
|
||||
tool::get_config().hsa_image_ext_api_trace || tool::get_config().hsa_finalizer_ext_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
@@ -1070,18 +1164,27 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
&get_buffers().hsa_api_trace),
|
||||
"buffer creation");
|
||||
|
||||
for(auto itr : {ROCPROFILER_BUFFER_TRACING_HSA_CORE_API,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API})
|
||||
using optpair_t = std::pair<bool, rocprofiler_buffer_tracing_kind_t>;
|
||||
for(auto itr : {optpair_t{tool::get_config().hsa_core_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_CORE_API},
|
||||
optpair_t{tool::get_config().hsa_core_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API},
|
||||
optpair_t{tool::get_config().hsa_core_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API},
|
||||
optpair_t{tool::get_config().hsa_core_api_trace,
|
||||
ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API}})
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(), itr, nullptr, 0, get_buffers().hsa_api_trace),
|
||||
"buffer tracing service for hsa api configure");
|
||||
if(itr.first)
|
||||
{
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(), itr.second, nullptr, 0, get_buffers().hsa_api_trace),
|
||||
"buffer tracing service for hsa api configure");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(tool::get_config().hip_api_trace || tool::get_config().hip_compiler_api_trace)
|
||||
if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
@@ -1092,7 +1195,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
&get_buffers().hip_api_trace),
|
||||
"buffer creation");
|
||||
|
||||
if(tool::get_config().hip_api_trace)
|
||||
if(tool::get_config().hip_runtime_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(),
|
||||
@@ -1187,7 +1290,8 @@ rocprofiler_configure(uint32_t version,
|
||||
uint32_t priority,
|
||||
rocprofiler_client_id_t* id)
|
||||
{
|
||||
common::init_logging("ROCPROF_LOG_LEVEL");
|
||||
auto logging_cfg = rocprofiler::common::logging_config{.install_failure_handler = true};
|
||||
common::init_logging("ROCPROF_LOG_LEVEL", logging_cfg);
|
||||
FLAGS_colorlogtostderr = true;
|
||||
|
||||
// set the client name
|
||||
@@ -1205,6 +1309,10 @@ rocprofiler_configure(uint32_t version,
|
||||
uint32_t minor = (version % 10000) / 100;
|
||||
uint32_t patch = version % 100;
|
||||
|
||||
// ensure these pointers are not leaked
|
||||
add_destructor(buffered_name_info);
|
||||
add_destructor(callback_name_info);
|
||||
|
||||
if(tool::get_config().list_metrics)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_at_intercept_table_registration(
|
||||
|
||||
@@ -67,8 +67,9 @@ set_target_properties(rocprofiler-object-library PROPERTIES POSITION_INDEPENDENT
|
||||
add_library(rocprofiler-shared-library SHARED)
|
||||
add_library(rocprofiler::rocprofiler-shared-library ALIAS rocprofiler-shared-library)
|
||||
|
||||
target_sources(rocprofiler-shared-library
|
||||
PRIVATE $<TARGET_OBJECTS:rocprofiler::rocprofiler-object-library>)
|
||||
target_sources(
|
||||
rocprofiler-shared-library
|
||||
PRIVATE $<TARGET_OBJECTS:rocprofiler::rocprofiler-object-library> shared_library.cpp)
|
||||
target_link_libraries(
|
||||
rocprofiler-shared-library
|
||||
INTERFACE rocprofiler::rocprofiler-headers
|
||||
|
||||
@@ -48,6 +48,14 @@
|
||||
|
||||
namespace fs = rocprofiler::common::filesystem;
|
||||
|
||||
#if defined(ROCPROFILER_CI)
|
||||
# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(FATAL, __VA_ARGS__)
|
||||
# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(FATAL)
|
||||
#else
|
||||
# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(NON_CI_LEVEL, __VA_ARGS__)
|
||||
# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL)
|
||||
#endif
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace agent
|
||||
@@ -697,7 +705,7 @@ construct_agent_cache(::HsaApiTable* table)
|
||||
},
|
||||
&hsa_agents);
|
||||
|
||||
LOG_IF(FATAL, rocp_agents.size() != hsa_agents.size())
|
||||
ROCP_CI_LOG_IF(ERROR, rocp_agents.size() != hsa_agents.size())
|
||||
<< "Found " << rocp_agents.size() << " rocprofiler agents and " << hsa_agents.size()
|
||||
<< " HSA agents";
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
#include "lib/common/container/small_vector.hpp"
|
||||
#include "lib/common/synchronized.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/agent.hpp"
|
||||
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
|
||||
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
|
||||
@@ -32,6 +33,7 @@
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
namespace rocprofiler
|
||||
@@ -255,6 +257,7 @@ queue_cb(const std::shared_ptr<counter_callback_info>& info,
|
||||
const hsa::Queue& queue,
|
||||
const hsa::rocprofiler_packet& pkt,
|
||||
uint64_t kernel_id,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
|
||||
const context::correlation_id* correlation_id)
|
||||
{
|
||||
@@ -272,14 +275,25 @@ queue_cb(const std::shared_ptr<counter_callback_info>& info,
|
||||
}
|
||||
}
|
||||
|
||||
rocprofiler_profile_config_id_t req_profile = {.handle = 0};
|
||||
info->user_cb(queue.get_id(),
|
||||
queue.get_agent().get_rocp_agent(),
|
||||
_corr_id_v,
|
||||
&pkt.kernel_dispatch,
|
||||
kernel_id,
|
||||
info->callback_args,
|
||||
&req_profile);
|
||||
auto req_profile = rocprofiler_profile_config_id_t{.handle = 0};
|
||||
auto dispatch_data =
|
||||
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{});
|
||||
|
||||
dispatch_data.kernel_id = kernel_id;
|
||||
dispatch_data.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id;
|
||||
dispatch_data.queue_id = queue.get_id();
|
||||
dispatch_data.correlation_id = _corr_id_v;
|
||||
dispatch_data.private_segment_size = pkt.kernel_dispatch.private_segment_size;
|
||||
dispatch_data.group_segment_size = pkt.kernel_dispatch.group_segment_size;
|
||||
dispatch_data.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x,
|
||||
pkt.kernel_dispatch.workgroup_size_y,
|
||||
pkt.kernel_dispatch.workgroup_size_z};
|
||||
dispatch_data.grid_size = {pkt.kernel_dispatch.grid_size_x,
|
||||
pkt.kernel_dispatch.grid_size_y,
|
||||
pkt.kernel_dispatch.grid_size_z};
|
||||
|
||||
info->user_cb(dispatch_data, &req_profile, user_data, info->callback_args);
|
||||
|
||||
if(req_profile.handle == 0) return nullptr;
|
||||
|
||||
auto prof_config = get_controller().get_profile_cfg(req_profile);
|
||||
@@ -407,13 +421,27 @@ completed_cb(const std::shared_ptr<counter_callback_info>& info,
|
||||
if(!out.empty())
|
||||
{
|
||||
CHECK(info->record_callback);
|
||||
info->record_callback(queue.get_id(),
|
||||
queue.get_agent().get_rocp_agent()->id,
|
||||
_corr_id_v,
|
||||
session.kernel_id,
|
||||
info->record_callback_args,
|
||||
out.size(),
|
||||
out.data());
|
||||
|
||||
auto dispatch_data =
|
||||
common::init_public_api_struct(rocprofiler_profile_counting_dispatch_data_t{});
|
||||
|
||||
const auto& kernel_dispatch_pkt = session.kernel_pkt.kernel_dispatch;
|
||||
|
||||
dispatch_data.kernel_id = session.kernel_id;
|
||||
dispatch_data.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id;
|
||||
dispatch_data.queue_id = queue.get_id();
|
||||
dispatch_data.correlation_id = _corr_id_v;
|
||||
dispatch_data.private_segment_size = kernel_dispatch_pkt.private_segment_size;
|
||||
dispatch_data.group_segment_size = kernel_dispatch_pkt.group_segment_size;
|
||||
dispatch_data.workgroup_size = {kernel_dispatch_pkt.workgroup_size_x,
|
||||
kernel_dispatch_pkt.workgroup_size_y,
|
||||
kernel_dispatch_pkt.workgroup_size_z};
|
||||
dispatch_data.grid_size = {kernel_dispatch_pkt.grid_size_x,
|
||||
kernel_dispatch_pkt.grid_size_y,
|
||||
kernel_dispatch_pkt.grid_size_z};
|
||||
|
||||
info->record_callback(
|
||||
dispatch_data, out.data(), out.size(), session.user_data, info->record_callback_args);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -436,9 +464,11 @@ start_context(const context::context* ctx)
|
||||
[=](const hsa::Queue& q,
|
||||
const hsa::rocprofiler_packet& kern_pkt,
|
||||
uint64_t kernel_id,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
|
||||
const context::correlation_id* correlation_id) {
|
||||
return queue_cb(cb, q, kern_pkt, kernel_id, extern_corr_ids, correlation_id);
|
||||
return queue_cb(
|
||||
cb, q, kern_pkt, kernel_id, user_data, extern_corr_ids, correlation_id);
|
||||
},
|
||||
// Completion CB
|
||||
[=](const hsa::Queue& q,
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/dispatch_profile.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
|
||||
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
|
||||
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
|
||||
@@ -80,7 +81,6 @@ struct counter_callback_info
|
||||
// HSA Queue ClientID. This is an ID we get when we insert a callback into the
|
||||
// HSA queue interceptor. This ID can be used to disable the callback.
|
||||
rocprofiler::hsa::ClientID queue_id{-1};
|
||||
|
||||
// Buffer to use for storing counter data. Used if callback is not set.
|
||||
std::optional<rocprofiler_buffer_id_t> buffer;
|
||||
|
||||
@@ -130,6 +130,7 @@ queue_cb(const std::shared_ptr<counter_callback_info>& info,
|
||||
const hsa::Queue& queue,
|
||||
const hsa::rocprofiler_packet& pkt,
|
||||
uint64_t kernel_id,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
|
||||
const context::correlation_id* correlation_id);
|
||||
|
||||
|
||||
@@ -20,10 +20,23 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
#include "lib/rocprofiler-sdk/counters/core.hpp"
|
||||
#include "lib/common/static_object.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/agent.hpp"
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/id_decode.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/dispatch_profile.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include <fmt/core.h>
|
||||
#include <gtest/gtest.h>
|
||||
@@ -31,21 +44,10 @@
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/common/static_object.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/agent.hpp"
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/core.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/id_decode.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
#include "rocprofiler-sdk/registration.h"
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
|
||||
using namespace rocprofiler::counters;
|
||||
using namespace rocprofiler;
|
||||
@@ -204,13 +206,10 @@ buffered_callback(rocprofiler_context_id_t,
|
||||
}
|
||||
|
||||
void
|
||||
null_dispatch_callback(rocprofiler_queue_id_t,
|
||||
const rocprofiler_agent_t*,
|
||||
rocprofiler_correlation_id_t,
|
||||
const hsa_kernel_dispatch_packet_t*,
|
||||
uint64_t,
|
||||
void*,
|
||||
rocprofiler_profile_config_id_t*)
|
||||
null_dispatch_callback(rocprofiler_profile_counting_dispatch_data_t,
|
||||
rocprofiler_profile_config_id_t*,
|
||||
rocprofiler_user_data_t*,
|
||||
void*)
|
||||
{}
|
||||
|
||||
void
|
||||
@@ -223,13 +222,11 @@ null_buffered_callback(rocprofiler_context_id_t,
|
||||
{}
|
||||
|
||||
void
|
||||
null_record_callback(rocprofiler_queue_id_t,
|
||||
rocprofiler_agent_id_t,
|
||||
rocprofiler_correlation_id_t,
|
||||
uint64_t,
|
||||
void*,
|
||||
null_record_callback(rocprofiler_profile_counting_dispatch_data_t,
|
||||
rocprofiler_record_counter_t*,
|
||||
size_t,
|
||||
rocprofiler_record_counter_t*)
|
||||
rocprofiler_user_data_t,
|
||||
void*)
|
||||
{}
|
||||
|
||||
} // namespace
|
||||
@@ -326,10 +323,10 @@ public:
|
||||
, _agent(a)
|
||||
, _id(id)
|
||||
{}
|
||||
virtual const AgentCache& get_agent() const override final { return _agent; };
|
||||
virtual rocprofiler_queue_id_t get_id() const override final { return _id; };
|
||||
const AgentCache& get_agent() const final { return _agent; };
|
||||
rocprofiler_queue_id_t get_id() const final { return _id; };
|
||||
|
||||
~FakeQueue() {}
|
||||
~FakeQueue() override = default;
|
||||
|
||||
private:
|
||||
const AgentCache& _agent;
|
||||
@@ -339,39 +336,60 @@ private:
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
bool
|
||||
operator==(rocprofiler_dim3_t lhs, rocprofiler_dim3_t rhs)
|
||||
{
|
||||
return std::tie(lhs.x, lhs.y, lhs.z) == std::tie(rhs.x, rhs.y, rhs.z);
|
||||
}
|
||||
|
||||
bool
|
||||
operator==(rocprofiler_agent_id_t lhs, rocprofiler_agent_id_t rhs)
|
||||
{
|
||||
return (lhs.handle == rhs.handle);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
struct expected_dispatch
|
||||
{
|
||||
// To pass back
|
||||
rocprofiler_profile_config_id_t id;
|
||||
|
||||
rocprofiler_queue_id_t queue_id;
|
||||
const rocprofiler_agent_t* agent;
|
||||
rocprofiler_correlation_id_t correlation_id;
|
||||
hsa_kernel_dispatch_packet_t* dispatch_packet;
|
||||
uint64_t kernel_id;
|
||||
rocprofiler_profile_config_id_t* config;
|
||||
rocprofiler_profile_config_id_t id = {};
|
||||
rocprofiler_queue_id_t queue_id = {.handle = 0};
|
||||
rocprofiler_agent_id_t agent_id = {.handle = 0};
|
||||
uint64_t kernel_id = 0;
|
||||
rocprofiler_correlation_id_t correlation_id = {.internal = 0, .external = {.value = 0}};
|
||||
rocprofiler_dim3_t workgroup_size = {0, 0, 0};
|
||||
rocprofiler_dim3_t grid_size = {0, 0, 0};
|
||||
rocprofiler_profile_config_id_t* config = nullptr;
|
||||
};
|
||||
|
||||
void
|
||||
user_dispatch_cb(rocprofiler_queue_id_t queue_id,
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t correlation_id,
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet,
|
||||
uint64_t kernel_id,
|
||||
void* callback_data_args,
|
||||
rocprofiler_profile_config_id_t* config)
|
||||
user_dispatch_cb(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
rocprofiler_user_data_t* user_data,
|
||||
void* callback_data_args)
|
||||
{
|
||||
expected_dispatch& expected = *static_cast<expected_dispatch*>(callback_data_args);
|
||||
ASSERT_EQ(expected.agent, agent);
|
||||
ASSERT_EQ(expected.queue_id.handle, queue_id.handle);
|
||||
ASSERT_EQ(expected.correlation_id.internal, correlation_id.internal);
|
||||
ASSERT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr);
|
||||
ASSERT_EQ(expected.correlation_id.external.value, correlation_id.external.value);
|
||||
ASSERT_EQ(expected.dispatch_packet, dispatch_packet);
|
||||
ASSERT_EQ(expected.kernel_id, kernel_id);
|
||||
|
||||
auto agent_id = dispatch_data.agent_id;
|
||||
auto queue_id = dispatch_data.queue_id;
|
||||
auto correlation_id = dispatch_data.correlation_id;
|
||||
auto kernel_id = dispatch_data.kernel_id;
|
||||
|
||||
EXPECT_EQ(sizeof(rocprofiler_profile_counting_dispatch_data_t), dispatch_data.size);
|
||||
EXPECT_EQ(expected.kernel_id, kernel_id);
|
||||
EXPECT_EQ(expected.agent_id, agent_id);
|
||||
EXPECT_EQ(expected.queue_id.handle, queue_id.handle);
|
||||
EXPECT_EQ(expected.correlation_id.internal, correlation_id.internal);
|
||||
EXPECT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr);
|
||||
EXPECT_EQ(expected.correlation_id.external.value, correlation_id.external.value);
|
||||
EXPECT_EQ(expected.workgroup_size, dispatch_data.workgroup_size);
|
||||
EXPECT_EQ(expected.grid_size, dispatch_data.grid_size);
|
||||
|
||||
ASSERT_NE(config, nullptr);
|
||||
config->handle = expected.id.handle;
|
||||
|
||||
(void) user_data;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -440,17 +458,22 @@ TEST(core, check_callbacks)
|
||||
hsa::rocprofiler_packet pkt;
|
||||
pkt.ext_amd_aql_pm4.header = count++;
|
||||
|
||||
expected.correlation_id = {.internal = corr_id.internal,
|
||||
expected.correlation_id = {.internal = corr_id.internal,
|
||||
.external = context::null_user_data};
|
||||
expected.dispatch_packet = &pkt.kernel_dispatch;
|
||||
expected.kernel_id = count++;
|
||||
expected.queue_id = qid;
|
||||
expected.agent = fq.get_agent().get_rocp_agent();
|
||||
expected.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x,
|
||||
pkt.kernel_dispatch.workgroup_size_y,
|
||||
pkt.kernel_dispatch.workgroup_size_z};
|
||||
expected.grid_size = {pkt.kernel_dispatch.grid_size_x,
|
||||
pkt.kernel_dispatch.grid_size_y,
|
||||
pkt.kernel_dispatch.grid_size_z};
|
||||
expected.kernel_id = count++;
|
||||
expected.queue_id = qid;
|
||||
expected.agent_id = fq.get_agent().get_rocp_agent()->id;
|
||||
|
||||
hsa::Queue::queue_info_session_t::external_corr_id_map_t extern_ids = {};
|
||||
|
||||
auto ret_pkt =
|
||||
counters::queue_cb(cb_info, fq, pkt, expected.kernel_id, extern_ids, &corr_id);
|
||||
auto user_data = rocprofiler_user_data_t{.value = corr_id.internal};
|
||||
auto ret_pkt = counters::queue_cb(
|
||||
cb_info, fq, pkt, expected.kernel_id, &user_data, extern_ids, &corr_id);
|
||||
|
||||
ASSERT_TRUE(ret_pkt) << fmt::format("Expected a packet to be generated for - {}",
|
||||
metric.name());
|
||||
|
||||
@@ -20,14 +20,6 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <fmt/core.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/common/static_object.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/agent.hpp"
|
||||
@@ -42,7 +34,15 @@
|
||||
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
#include "rocprofiler-sdk/registration.h"
|
||||
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include <fmt/core.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
@@ -20,16 +20,6 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
|
||||
#include <fmt/core.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/common/static_object.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
@@ -37,7 +27,18 @@
|
||||
#include "lib/rocprofiler-sdk/counters/id_decode.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
#include "rocprofiler-sdk/registration.h"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include <fmt/core.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <sstream>
|
||||
#include <tuple>
|
||||
|
||||
using namespace rocprofiler::counters;
|
||||
|
||||
@@ -125,13 +126,10 @@ buffered_callback(rocprofiler_context_id_t,
|
||||
{}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_queue_id_t,
|
||||
const rocprofiler_agent_t*,
|
||||
rocprofiler_correlation_id_t,
|
||||
const hsa_kernel_dispatch_packet_t*,
|
||||
uint64_t,
|
||||
void*,
|
||||
rocprofiler_profile_config_id_t*)
|
||||
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t,
|
||||
rocprofiler_profile_config_id_t*,
|
||||
rocprofiler_user_data_t*,
|
||||
void*)
|
||||
{}
|
||||
|
||||
rocprofiler_context_id_t&
|
||||
|
||||
@@ -166,12 +166,131 @@ get_names()
|
||||
|
||||
namespace
|
||||
{
|
||||
using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t;
|
||||
using context_t = context::context;
|
||||
using user_data_t = rocprofiler_user_data_t;
|
||||
using context_array_t = context::context_array_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using name_array_t = std::vector<std::pair<size_t, std::unique_ptr<std::string>>>;
|
||||
using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t;
|
||||
using context_t = context::context;
|
||||
using user_data_t = rocprofiler_user_data_t;
|
||||
using context_array_t = context::context_array_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using name_array_t = std::vector<std::pair<size_t, std::unique_ptr<std::string>>>;
|
||||
using amd_compute_pgm_rsrc_three32_t = uint32_t;
|
||||
|
||||
struct kernel_descriptor_t
|
||||
{
|
||||
uint8_t reserved0[16];
|
||||
int64_t kernel_code_entry_byte_offset;
|
||||
uint8_t reserved1[20];
|
||||
uint32_t compute_pgm_rsrc3;
|
||||
uint32_t compute_pgm_rsrc1;
|
||||
uint32_t compute_pgm_rsrc2;
|
||||
uint16_t kernel_code_properties;
|
||||
uint8_t reserved2[6];
|
||||
};
|
||||
|
||||
// AMD Compute Program Resource Register Three.
|
||||
enum amd_compute_gfx9_pgm_rsrc_three_t
|
||||
{
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1)
|
||||
};
|
||||
|
||||
enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t
|
||||
{
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_SHARED_VGPR_COUNT, 0, 4),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_INST_PREF_SIZE, 4, 6),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_START, 10, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TRAP_ON_END, 11, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_IMAGE_OP, 31, 1)
|
||||
};
|
||||
|
||||
// Kernel code properties.
|
||||
enum amd_kernel_code_property_t
|
||||
{
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
|
||||
0,
|
||||
1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
|
||||
3,
|
||||
1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, 4, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
|
||||
6,
|
||||
1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
|
||||
10,
|
||||
1), // GFX10+
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1),
|
||||
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4),
|
||||
};
|
||||
|
||||
uint32_t
|
||||
arch_vgpr_count(std::string_view name, kernel_descriptor_t kernel_code)
|
||||
{
|
||||
if(name == "gfx90a" || name.find("gfx94") == 0)
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3,
|
||||
AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) +
|
||||
1) *
|
||||
4;
|
||||
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
|
||||
1) *
|
||||
(AMD_HSA_BITS_GET(kernel_code.kernel_code_properties,
|
||||
AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)
|
||||
? 8
|
||||
: 4);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
accum_vgpr_count(std::string_view name, kernel_descriptor_t kernel_code)
|
||||
{
|
||||
if(name == "gfx908")
|
||||
return arch_vgpr_count(name, kernel_code);
|
||||
else if(name == "gfx90a" || name.find("gfx94") == 0)
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) +
|
||||
1) *
|
||||
(8 - arch_vgpr_count(name, kernel_code));
|
||||
|
||||
LOG(WARNING) << "Missing support for accum_vgpr_count for " << name;
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
sgpr_count(std::string_view name, kernel_descriptor_t kernel_code)
|
||||
{
|
||||
// GFX10 and later always allocate 128 sgprs.
|
||||
constexpr uint32_t gfx10_sgprs = 128;
|
||||
|
||||
auto begp = name.find_first_of("0123456789");
|
||||
if(!name.empty() && begp != std::string_view::npos)
|
||||
{
|
||||
auto endp = name.find_first_not_of("0123456789", begp);
|
||||
auto lenp = (endp - begp) + 1;
|
||||
auto gfxip_str = name.substr(begp, lenp);
|
||||
auto gfxip_n = int32_t{0};
|
||||
if(!gfxip_str.empty()) gfxip_n = std::stoi(std::string{gfxip_str});
|
||||
|
||||
if(gfxip_n >= 1000)
|
||||
{
|
||||
return gfx10_sgprs;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1,
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) /
|
||||
2 +
|
||||
1) *
|
||||
16;
|
||||
}
|
||||
}
|
||||
|
||||
LOG(WARNING) << "Missing support for sgpr_count for " << name;
|
||||
return 0;
|
||||
}
|
||||
|
||||
name_array_t*
|
||||
get_string_array()
|
||||
@@ -212,6 +331,41 @@ get_loader_table()
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto*&
|
||||
get_status_string_function()
|
||||
{
|
||||
static decltype(::hsa_status_string)* _v = nullptr;
|
||||
return _v;
|
||||
}
|
||||
|
||||
std::string_view
|
||||
get_status_string(hsa_status_t _status)
|
||||
{
|
||||
const char* _msg = nullptr;
|
||||
if(get_status_string_function() &&
|
||||
get_status_string_function()(_status, &_msg) == HSA_STATUS_SUCCESS && _msg)
|
||||
return std::string_view{_msg};
|
||||
|
||||
return std::string_view{"(unknown HSA error)"};
|
||||
}
|
||||
|
||||
const kernel_descriptor_t*
|
||||
get_kernel_descriptor(uint64_t kernel_object)
|
||||
{
|
||||
const kernel_descriptor_t* kernel_code = nullptr;
|
||||
if(get_loader_table().hsa_ven_amd_loader_query_host_address == nullptr) return kernel_code;
|
||||
hsa_status_t status = get_loader_table().hsa_ven_amd_loader_query_host_address(
|
||||
reinterpret_cast<const void*>(kernel_object), // NOLINT(performance-no-int-to-ptr)
|
||||
reinterpret_cast<const void**>(&kernel_code));
|
||||
if(status == HSA_STATUS_SUCCESS) return kernel_code;
|
||||
|
||||
LOG(WARNING) << "hsa_ven_amd_loader_query_host_address(kernel_object=" << kernel_object
|
||||
<< ") returned " << status << ": " << get_status_string(status);
|
||||
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return reinterpret_cast<kernel_descriptor_t*>(kernel_object);
|
||||
}
|
||||
|
||||
struct kernel_symbol
|
||||
{
|
||||
using kernel_symbol_data_t =
|
||||
@@ -441,6 +595,19 @@ executable_iterate_agent_symbols_load_callback(hsa_executable_t executabl
|
||||
ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
|
||||
&data.private_segment_size);
|
||||
|
||||
// This works for gfx9 but may not for Navi arch
|
||||
const auto* kernel_descript = get_kernel_descriptor(data.kernel_object);
|
||||
if(CHECK_NOTNULL(code_obj_v) && CHECK_NOTNULL(kernel_descript))
|
||||
{
|
||||
const auto* rocp_agent = agent::get_agent(code_obj_v->rocp_data.rocp_agent);
|
||||
if(CHECK_NOTNULL(rocp_agent))
|
||||
{
|
||||
data.arch_vgpr_count = arch_vgpr_count(rocp_agent->name, *kernel_descript);
|
||||
data.accum_vgpr_count = accum_vgpr_count(rocp_agent->name, *kernel_descript);
|
||||
data.sgpr_count = sgpr_count(rocp_agent->name, *kernel_descript);
|
||||
}
|
||||
}
|
||||
|
||||
// if we have reached this point (i.e. there were no HSA errors returned within macro) then we
|
||||
// generate a unique kernel symbol id
|
||||
data.kernel_id = ++get_kernel_symbol_id();
|
||||
@@ -905,10 +1072,13 @@ code_object_init(HsaApiTable* table)
|
||||
{
|
||||
auto& core_table = *table->core_;
|
||||
|
||||
get_status_string_function() = core_table.hsa_status_string_fn;
|
||||
|
||||
auto _status = core_table.hsa_system_get_major_extension_table_fn(
|
||||
HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_loader_table_t), &get_loader_table());
|
||||
|
||||
LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS) << "hsa_system_get_major_extension_table failed";
|
||||
LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS)
|
||||
<< "hsa_system_get_major_extension_table failed: " << get_status_string(_status);
|
||||
|
||||
if(_status == HSA_STATUS_SUCCESS)
|
||||
{
|
||||
|
||||
@@ -253,8 +253,9 @@ WriteInterceptor(const void* packets,
|
||||
return;
|
||||
}
|
||||
|
||||
auto thr_id = common::get_tid();
|
||||
auto* corr_id = context::get_latest_correlation_id();
|
||||
auto thr_id = common::get_tid();
|
||||
auto* corr_id = context::get_latest_correlation_id();
|
||||
auto user_data = rocprofiler_user_data_t{.value = 0};
|
||||
|
||||
// use thread-local value to reuse allocation
|
||||
auto extern_corr_ids = Queue::queue_info_session_t::external_corr_id_map_t{};
|
||||
@@ -300,8 +301,8 @@ WriteInterceptor(const void* packets,
|
||||
queue.signal_callback([&](const auto& map) {
|
||||
for(const auto& [client_id, cb_pair] : map)
|
||||
{
|
||||
if(auto maybe_pkt =
|
||||
cb_pair.first(queue, kernel_pkt, kernel_id, extern_corr_ids, corr_id))
|
||||
if(auto maybe_pkt = cb_pair.first(
|
||||
queue, kernel_pkt, kernel_id, &user_data, extern_corr_ids, corr_id))
|
||||
{
|
||||
inst_pkt.push_back(std::make_pair(std::move(maybe_pkt), client_id));
|
||||
}
|
||||
@@ -381,6 +382,7 @@ WriteInterceptor(const void* packets,
|
||||
.tid = thr_id,
|
||||
.kernel_id = kernel_id,
|
||||
.queue_id = queue.get_id(),
|
||||
.user_data = user_data,
|
||||
.hsa_agent = queue.get_agent().get_hsa_agent(),
|
||||
.rocp_agent = queue.get_agent().get_rocp_agent(),
|
||||
.correlation_id = corr_id,
|
||||
|
||||
@@ -122,6 +122,7 @@ public:
|
||||
rocprofiler_thread_id_t tid = common::get_tid();
|
||||
rocprofiler_kernel_id_t kernel_id = 0;
|
||||
rocprofiler_queue_id_t queue_id = {};
|
||||
rocprofiler_user_data_t user_data = {.value = 0};
|
||||
hsa_agent_t hsa_agent = {};
|
||||
const rocprofiler_agent_t* rocp_agent = nullptr;
|
||||
context::correlation_id* correlation_id = nullptr;
|
||||
@@ -137,6 +138,7 @@ public:
|
||||
const Queue&,
|
||||
const rocprofiler_packet&,
|
||||
uint64_t,
|
||||
rocprofiler_user_data_t*,
|
||||
const queue_info_session_t::external_corr_id_map_t&,
|
||||
const context::correlation_id*)>;
|
||||
// Signals the completion of the kernel packet.
|
||||
@@ -220,4 +222,4 @@ Queue::signal_callback(FuncT&& func) const
|
||||
}
|
||||
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <glog/logging.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/common/synchronized.hpp"
|
||||
@@ -31,7 +31,8 @@
|
||||
#include "lib/rocprofiler-sdk/counters/evaluate_ast.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
|
||||
#include "rocprofiler-sdk/fwd.h"
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
extern "C" {
|
||||
/**
|
||||
|
||||
@@ -235,14 +235,15 @@ find_clients()
|
||||
{
|
||||
for(const auto& itr : env)
|
||||
{
|
||||
LOG(INFO) << "searching " << itr << " for rocprofiler_configure";
|
||||
LOG(INFO) << "[env] searching " << itr << " for rocprofiler_configure";
|
||||
|
||||
void* handle = dlopen(itr.c_str(), RTLD_NOLOAD | RTLD_LAZY);
|
||||
|
||||
if(!handle)
|
||||
{
|
||||
LOG(INFO) << itr << " is not already loaded, doing a global lazy dlopen...";
|
||||
handle = dlopen(itr.c_str(), RTLD_GLOBAL | RTLD_LAZY);
|
||||
LOG(WARNING) << "[env] " << itr
|
||||
<< " is not already loaded, doing a local lazy dlopen...";
|
||||
handle = dlopen(itr.c_str(), RTLD_LOCAL | RTLD_LAZY);
|
||||
}
|
||||
|
||||
if(!handle)
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "lib/common/environment.hpp"
|
||||
#include "lib/common/logging.hpp"
|
||||
#include "lib/common/static_object.hpp"
|
||||
#include "lib/rocprofiler-sdk/allocator.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace shared_library
|
||||
{
|
||||
namespace
|
||||
{
|
||||
struct lifetime
|
||||
{
|
||||
lifetime();
|
||||
~lifetime();
|
||||
};
|
||||
|
||||
lifetime::lifetime()
|
||||
{
|
||||
registration::init_logging();
|
||||
|
||||
if(common::get_env("ROCPROFILER_LIBRARY_CTOR", false))
|
||||
{
|
||||
LOG(INFO) << "Initializing rocprofiler-sdk library...";
|
||||
registration::initialize();
|
||||
LOG(INFO) << "rocprofiler-sdk library initialized";
|
||||
}
|
||||
}
|
||||
|
||||
lifetime::~lifetime()
|
||||
{
|
||||
if(common::get_env("ROCPROFILER_LIBRARY_DTOR", false))
|
||||
{
|
||||
LOG(INFO) << "Finalizing rocprofiler-sdk library...";
|
||||
registration::finalize();
|
||||
LOG(INFO) << "rocprofiler-sdk library finalized";
|
||||
}
|
||||
}
|
||||
|
||||
auto*&
|
||||
get_lifetime()
|
||||
{
|
||||
static auto* _v = common::static_object<lifetime>::construct();
|
||||
return _v;
|
||||
}
|
||||
} // namespace
|
||||
} // namespace shared_library
|
||||
|
||||
auto rocprofiler_sdk_shlib_lifetime = shared_library::get_lifetime();
|
||||
|
||||
void
|
||||
rocprofiler_sdk_shlib_ctor() ROCPROFILER_ATTRIBUTE(constructor(101));
|
||||
|
||||
void
|
||||
rocprofiler_sdk_shlib_ctor()
|
||||
{
|
||||
(void) shared_library::get_lifetime();
|
||||
}
|
||||
} // namespace rocprofiler
|
||||
@@ -22,11 +22,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/common/defines.hpp"
|
||||
#include "rocprofiler-sdk/fwd.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
|
||||
@@ -20,10 +20,14 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <rocprofiler-sdk-roctx/api_trace.h>
|
||||
#include <rocprofiler-sdk-roctx/roctx.h>
|
||||
#include <rocprofiler-sdk-roctx/types.h>
|
||||
#include <rocprofiler-sdk/buffer.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/context.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/marker/api_id.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
@@ -33,13 +37,9 @@
|
||||
#include "lib/common/units.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/tests/common.hpp"
|
||||
#include "rocprofiler-sdk-roctx/api_trace.h"
|
||||
#include "rocprofiler-sdk-roctx/types.h"
|
||||
#include "rocprofiler-sdk/callback_tracing.h"
|
||||
#include "rocprofiler-sdk/context.h"
|
||||
#include "rocprofiler-sdk/marker/api_id.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <pthread.h>
|
||||
|
||||
@@ -40,8 +40,11 @@ add_subdirectory(common)
|
||||
# tool libraries used for data collection during integration tests
|
||||
add_subdirectory(tools)
|
||||
|
||||
# libraries used by integration test applications
|
||||
add_subdirectory(lib)
|
||||
|
||||
# applications used by integration tests
|
||||
add_subdirectory(apps)
|
||||
add_subdirectory(bin)
|
||||
|
||||
# validation tests
|
||||
add_subdirectory(kernel-tracing)
|
||||
|
||||
@@ -29,7 +29,8 @@ set(async-copy-tracing-env
|
||||
set_tests_properties(
|
||||
test-async-copy-tracing-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
|
||||
"${async-copy-tracing-env}" FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
"${async-copy-tracing-env}" FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
foreach(FILENAME validate.py pytest.ini conftest.py)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
|
||||
@@ -44,4 +45,4 @@ set_tests_properties(
|
||||
test-async-copy-tracing-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
test-async-copy-tracing-execute FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
+2
-1
@@ -3,7 +3,7 @@
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-test-apps LANGUAGES C CXX)
|
||||
project(rocprofiler-tests-bin LANGUAGES C CXX)
|
||||
|
||||
set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib")
|
||||
|
||||
@@ -11,6 +11,7 @@ set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib")
|
||||
add_subdirectory(simple-transpose)
|
||||
add_subdirectory(multistream)
|
||||
add_subdirectory(vector-operations)
|
||||
add_subdirectory(hip-in-libraries)
|
||||
|
||||
set(CMAKE_BUILD_RPATH
|
||||
"\$ORIGIN:\$ORIGIN/../lib:$<TARGET_FILE_DIR:rocprofiler-sdk-roctx::rocprofiler-sdk-roctx-shared-library>"
|
||||
@@ -0,0 +1,34 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-tests-bin-hip-in-libraries LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
add_executable(hip-in-libraries)
|
||||
target_sources(hip-in-libraries PRIVATE hip-in-libraries.cpp)
|
||||
target_compile_options(hip-in-libraries PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow
|
||||
-Werror)
|
||||
target_link_libraries(hip-in-libraries PRIVATE transpose-shared-library
|
||||
vector-ops-shared-library)
|
||||
|
||||
find_package(hip REQUIRED)
|
||||
target_link_libraries(hip-in-libraries PRIVATE hip::host)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(hip-in-libraries PRIVATE Threads::Threads)
|
||||
|
||||
if(TRANSPOSE_USE_MPI)
|
||||
find_package(MPI REQUIRED)
|
||||
target_compile_definitions(hip-in-libraries PRIVATE USE_MPI)
|
||||
target_link_libraries(hip-in-libraries PRIVATE MPI::MPI_C)
|
||||
endif()
|
||||
|
||||
install(
|
||||
TARGETS hip-in-libraries
|
||||
DESTINATION bin
|
||||
COMPONENT tests)
|
||||
@@ -0,0 +1,151 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include "transpose.hpp"
|
||||
#include "vector-ops.hpp"
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <stdexcept>
|
||||
#include <thread>
|
||||
|
||||
#if defined(USE_MPI)
|
||||
# include <mpi.h>
|
||||
#endif
|
||||
|
||||
#define HIP_API_CALL(CALL) \
|
||||
{ \
|
||||
hipError_t error_ = (CALL); \
|
||||
if(error_ != hipSuccess) \
|
||||
{ \
|
||||
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
|
||||
fprintf(stderr, \
|
||||
"%s:%d :: HIP error : %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
hipGetErrorString(error_)); \
|
||||
throw std::runtime_error("hip_api_call"); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
size_t nqueues = 8;
|
||||
size_t nthreads = 4;
|
||||
size_t nitr = 500;
|
||||
size_t nsync = 10;
|
||||
} // namespace
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
int rank = 0;
|
||||
int size = 1;
|
||||
|
||||
#if defined(USE_MPI)
|
||||
MPI_Init(&argc, &argv);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||
#else
|
||||
(void) size;
|
||||
#endif
|
||||
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
auto _arg = std::string{argv[i]};
|
||||
if(_arg == "?" || _arg == "-h" || _arg == "--help")
|
||||
{
|
||||
if(rank == 0)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"usage: hip-in-libraries [NUM_QUEUES (%zu)] [NUM_THREADS (%zu)] "
|
||||
"[NUM_ITERATION (%zu)] "
|
||||
"[SYNC_EVERY_N_ITERATIONS (%zu)]\n",
|
||||
nqueues,
|
||||
nthreads,
|
||||
nitr,
|
||||
nsync);
|
||||
}
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(argc > 1) nqueues = atoll(argv[1]);
|
||||
if(argc > 2) nthreads = atoll(argv[2]);
|
||||
if(argc > 3) nitr = atoll(argv[3]);
|
||||
if(argc > 4) nsync = atoll(argv[4]);
|
||||
|
||||
int ndevice = 0;
|
||||
HIP_API_CALL(hipGetDeviceCount(&ndevice));
|
||||
|
||||
printf("[hip-in-libraries] Number of devices found: %i\n", ndevice);
|
||||
printf("[hip-in-libraries] Number of queues: %zu\n", nqueues);
|
||||
printf("[hip-in-libraries] Number of threads: %zu\n", nthreads);
|
||||
printf("[hip-in-libraries] Number of iterations: %zu\n", nitr);
|
||||
printf("[hip-in-libraries] Syncing every %zu iterations\n", nsync);
|
||||
|
||||
{
|
||||
auto vector_ops_thread = std::thread{run_vector_ops, nthreads, nqueues};
|
||||
auto transpose_thread = std::thread{run_transpose, nthreads, nitr, nsync};
|
||||
|
||||
vector_ops_thread.join();
|
||||
transpose_thread.join();
|
||||
}
|
||||
|
||||
// this is a temporary workaround in omnitrace when HIP + MPI is enabled
|
||||
|
||||
#if defined(USE_MPI)
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
for(int i = 0; i < ndevice; ++i)
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(i));
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
#if defined(USE_MPI)
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
if(rank == 0)
|
||||
{
|
||||
for(int i = 0; i < ndevice; ++i)
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(i));
|
||||
HIP_API_CALL(hipDeviceReset());
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(USE_MPI)
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
+1
-1
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-test-app-multistream LANGUAGES CXX HIP)
|
||||
project(rocprofiler-tests-bin-multistream LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
+1
-1
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-test-app-reproducible-runtime LANGUAGES CXX HIP)
|
||||
project(rocprofiler-tests-bin-reproducible-runtime LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
+1
-1
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-tool-test-app-transpose LANGUAGES CXX HIP)
|
||||
project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
+1
-1
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-test-app-transpose LANGUAGES CXX HIP)
|
||||
project(rocprofiler-tests-bin-transpose LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
+5
-3
@@ -189,7 +189,7 @@ run(int rank, int tid, int devid, int argc, char** argv)
|
||||
HIP_API_CALL(hipStreamCreate(&stream));
|
||||
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
_lk.unlock();
|
||||
|
||||
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
|
||||
@@ -230,8 +230,10 @@ run(int rank, int tid, int devid, int argc, char** argv)
|
||||
float GB = (float) size * nitr * 2 / (1 << 30);
|
||||
|
||||
print_lock.lock();
|
||||
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n"
|
||||
<< "The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
|
||||
<< " sec\n";
|
||||
std::cout << "[transpose][" << rank << "][" << tid
|
||||
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
<< std::endl;
|
||||
print_lock.unlock();
|
||||
|
||||
+1
-1
@@ -17,7 +17,7 @@ if(NOT CMAKE_HIP_COMPILER)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-tool-test-app-transpose LANGUAGES CXX HIP)
|
||||
project(rocprofiler-tests-bin-vector-operations LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
+68
-46
@@ -1,24 +1,25 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdio.h>
|
||||
@@ -132,8 +133,12 @@ divide_kernel(float* __restrict__ a,
|
||||
using namespace std;
|
||||
|
||||
void
|
||||
run(int NUM_QUEUE)
|
||||
run(int NUM_QUEUE, int DEVICE_ID)
|
||||
{
|
||||
HIP_API_CALL(hipSetDevice(DEVICE_ID));
|
||||
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
|
||||
std::vector<float*> hostA(NUM_QUEUE);
|
||||
std::vector<float*> hostB(NUM_QUEUE);
|
||||
std::vector<float*> hostC(NUM_QUEUE);
|
||||
@@ -144,10 +149,18 @@ run(int NUM_QUEUE)
|
||||
|
||||
std::vector<hipStream_t> streams(NUM_QUEUE);
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
HIP_API_CALL(hipGetDeviceProperties(&devProp, 0));
|
||||
auto sync_stream = [NUM_QUEUE, streams](int q) {
|
||||
if(q < 0 || q >= NUM_QUEUE)
|
||||
throw std::runtime_error{std::string{"invalid stream id: "} + std::to_string(q)};
|
||||
|
||||
int i;
|
||||
HIP_API_CALL(hipStreamSynchronize(streams.at(q)));
|
||||
};
|
||||
|
||||
auto sync_streams = [NUM_QUEUE, sync_stream]() {
|
||||
for(int i = 0; i < NUM_QUEUE; ++i)
|
||||
sync_stream(i);
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
};
|
||||
|
||||
for(int q = 0; q < NUM_QUEUE; q++)
|
||||
{
|
||||
@@ -158,26 +171,26 @@ run(int NUM_QUEUE)
|
||||
HIP_API_CALL(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0));
|
||||
|
||||
// initialize the input data
|
||||
for(i = 0; i < NUM; i++)
|
||||
for(int i = 0; i < NUM; i++)
|
||||
{
|
||||
hostB[q][i] = (float) i;
|
||||
hostC[q][i] = (float) i * 100.0f;
|
||||
hostB[q][i] = static_cast<float>(i);
|
||||
hostC[q][i] = static_cast<float>(i * 100.0f);
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipMalloc((void**) (&deviceA[q]), NUM * sizeof(float)));
|
||||
HIP_API_CALL(hipMalloc((void**) (&deviceB[q]), NUM * sizeof(float)));
|
||||
HIP_API_CALL(hipMalloc((void**) (&deviceC[q]), NUM * sizeof(float)));
|
||||
HIP_API_CALL(hipMallocAsync(&deviceA[q], NUM * sizeof(float), streams[q]));
|
||||
HIP_API_CALL(hipMallocAsync(&deviceB[q], NUM * sizeof(float), streams[q]));
|
||||
HIP_API_CALL(hipMallocAsync(&deviceC[q], NUM * sizeof(float), streams[q]));
|
||||
|
||||
HIP_API_CALL(hipMemcpyAsync(
|
||||
deviceB[q], hostB[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
|
||||
HIP_API_CALL(hipMemcpyAsync(
|
||||
deviceC[q], hostC[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
|
||||
}
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
|
||||
for(int RUN_I = 0; RUN_I < 2; RUN_I++)
|
||||
sync_streams();
|
||||
|
||||
for(int q = 0; q < NUM_QUEUE; q++)
|
||||
{
|
||||
int q = (4 * RUN_I + 0) % NUM_QUEUE;
|
||||
hipLaunchKernelGGL(addition_kernel,
|
||||
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
@@ -189,8 +202,8 @@ run(int NUM_QUEUE)
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
q = (4 * RUN_I + 1) % NUM_QUEUE;
|
||||
HIP_API_CALL(hipGetLastError());
|
||||
|
||||
hipLaunchKernelGGL(subtract_kernel,
|
||||
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
@@ -202,8 +215,8 @@ run(int NUM_QUEUE)
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
q = (4 * RUN_I + 2) % NUM_QUEUE;
|
||||
HIP_API_CALL(hipGetLastError());
|
||||
|
||||
hipLaunchKernelGGL(multiply_kernel,
|
||||
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
@@ -215,8 +228,8 @@ run(int NUM_QUEUE)
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
q = (4 * RUN_I + 3) % NUM_QUEUE;
|
||||
HIP_API_CALL(hipGetLastError());
|
||||
|
||||
hipLaunchKernelGGL(divide_kernel,
|
||||
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
@@ -227,17 +240,18 @@ run(int NUM_QUEUE)
|
||||
deviceC[q],
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
|
||||
HIP_API_CALL(hipGetLastError());
|
||||
}
|
||||
|
||||
for(int q = 0; q < NUM_QUEUE; q++)
|
||||
HIP_API_CALL(hipMemcpyAsync(
|
||||
hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q]));
|
||||
sync_streams();
|
||||
|
||||
for(int q = 0; q < NUM_QUEUE; q++)
|
||||
{
|
||||
HIP_API_CALL(hipMemcpy(hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost));
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
HIP_API_CALL(hipMemcpyAsync(
|
||||
hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q]));
|
||||
|
||||
sync_stream(q);
|
||||
|
||||
HIP_API_CALL(hipFree(deviceA[q]));
|
||||
HIP_API_CALL(hipFree(deviceB[q]));
|
||||
@@ -246,13 +260,21 @@ run(int NUM_QUEUE)
|
||||
HIP_API_CALL(hipHostFree(hostA[q]));
|
||||
HIP_API_CALL(hipHostFree(hostB[q]));
|
||||
HIP_API_CALL(hipHostFree(hostC[q]));
|
||||
|
||||
HIP_API_CALL(hipStreamDestroy(streams[q]));
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
run(1);
|
||||
int device_count = 0;
|
||||
HIP_API_CALL(hipGetDeviceCount(&device_count));
|
||||
|
||||
for(int i = 0; i < device_count; ++i)
|
||||
run(4, i);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -35,4 +35,4 @@ set_tests_properties(
|
||||
PASS_REGULAR_EXPRESSION
|
||||
"Test C tool is using rocprofiler-sdk v([0-9]+\\.[0-9]+\\.[0-9]+)"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -6,6 +6,11 @@ include(FetchContent)
|
||||
|
||||
set(FETCHCONTENT_BASE_DIR ${PROJECT_BINARY_DIR}/external)
|
||||
|
||||
# default FAIL_REGULAR_EXPRESSION for tests
|
||||
set(ROCPROFILER_DEFAULT_FAIL_REGEX
|
||||
"threw an exception|Permission denied|Could not create logging file"
|
||||
CACHE STRING "Default FAIL_REGULAR_EXPRESSION for tests")
|
||||
|
||||
# build flags
|
||||
add_library(rocprofiler-tests-build-flags INTERFACE)
|
||||
add_library(rocprofiler::tests-build-flags ALIAS rocprofiler-tests-build-flags)
|
||||
|
||||
@@ -29,7 +29,7 @@ set_tests_properties(
|
||||
ENVIRONMENT
|
||||
"${PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>;ROCPROFILER_TOOL_OUTPUT_FILE=counter-collection-test.json;ROCPROFILER_TOOL_CONTEXTS=COUNTER_COLLECTION;ROCPROF_COUNTERS=SQ_WAVES_sum"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
foreach(FILENAME validate.py pytest.ini conftest.py)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
|
||||
@@ -44,4 +44,4 @@ set_tests_properties(
|
||||
test-counter-collection-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
test-counter-collection-execute FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -34,7 +34,7 @@ set(kernel-tracing-env
|
||||
set_tests_properties(
|
||||
test-kernel-tracing-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${kernel-tracing-env}"
|
||||
FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
foreach(FILENAME validate.py pytest.ini conftest.py)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
|
||||
@@ -49,4 +49,4 @@ add_test(
|
||||
set_tests_properties(
|
||||
test-kernel-tracing-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS test-kernel-tracing-execute
|
||||
FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
#
|
||||
# Integration test application libraries
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-tests-lib LANGUAGES C CXX)
|
||||
|
||||
set(CMAKE_BUILD_RPATH "\$ORIGIN:\$ORIGIN/../lib")
|
||||
|
||||
# libraries used by integration test apps which DO NOT link to rocprofiler-sdk-roctx
|
||||
add_subdirectory(vector-operations)
|
||||
|
||||
set(CMAKE_BUILD_RPATH
|
||||
"\$ORIGIN:\$ORIGIN/../lib:$<TARGET_FILE_DIR:rocprofiler-sdk-roctx::rocprofiler-sdk-roctx-shared-library>"
|
||||
)
|
||||
|
||||
# libraries used by integration test apps which DO link to rocprofiler-sdk-roctx
|
||||
add_subdirectory(transpose)
|
||||
@@ -0,0 +1,61 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-tests-lib-transpose-shared-library LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
option(TRANSPOSE_USE_MPI "Enable MPI support in transpose-shared-library exe" OFF)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
|
||||
set_source_files_properties(transpose.cpp PROPERTIES LANGUAGE HIP)
|
||||
add_library(transpose-shared-library SHARED)
|
||||
target_sources(transpose-shared-library PRIVATE transpose.cpp)
|
||||
target_compile_options(transpose-shared-library PRIVATE -W -Wall -Wextra -Wpedantic
|
||||
-Wshadow -Werror)
|
||||
target_include_directories(transpose-shared-library PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
set_target_properties(transpose-shared-library PROPERTIES OUTPUT_NAME transpose)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(transpose-shared-library PRIVATE Threads::Threads)
|
||||
|
||||
find_package(rocprofiler-sdk-roctx REQUIRED)
|
||||
target_link_libraries(transpose-shared-library
|
||||
PRIVATE rocprofiler-sdk-roctx::rocprofiler-sdk-roctx)
|
||||
|
||||
if(TRANSPOSE_USE_MPI)
|
||||
find_package(MPI REQUIRED)
|
||||
target_compile_definitions(transpose-shared-library PRIVATE USE_MPI)
|
||||
target_link_libraries(transpose-shared-library PRIVATE MPI::MPI_C)
|
||||
endif()
|
||||
|
||||
install(
|
||||
TARGETS transpose-shared-library
|
||||
DESTINATION lib
|
||||
COMPONENT tests)
|
||||
@@ -0,0 +1,260 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "rocprofiler-sdk-roctx/roctx.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
|
||||
#if defined(USE_MPI)
|
||||
# include <mpi.h>
|
||||
#endif
|
||||
|
||||
#define HIP_API_CALL(CALL) \
|
||||
{ \
|
||||
hipError_t error_ = (CALL); \
|
||||
if(error_ != hipSuccess) \
|
||||
{ \
|
||||
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
|
||||
fprintf(stderr, \
|
||||
"%s:%d :: HIP error : %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
hipGetErrorString(error_)); \
|
||||
throw std::runtime_error("hip_api_call"); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
constexpr unsigned shared_mem_tile_dim = 32;
|
||||
|
||||
void
|
||||
check_hip_error(void);
|
||||
|
||||
void
|
||||
verify(int* in, int* out, int M, int N);
|
||||
|
||||
__global__ void
|
||||
transpose(const int* in, int* out, int M, int N);
|
||||
|
||||
void
|
||||
run_transpose_impl(int rank, int tid, int ndevice, size_t nitr, size_t nsync);
|
||||
|
||||
__global__ void
|
||||
transpose(const int* in, int* out, int M, int N)
|
||||
{
|
||||
__shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim];
|
||||
|
||||
int idx = (blockIdx.y * blockDim.y + threadIdx.y) * M + blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tile[threadIdx.y][threadIdx.x] = in[idx];
|
||||
__syncthreads();
|
||||
idx = (blockIdx.x * blockDim.x + threadIdx.y) * N + blockIdx.y * blockDim.y + threadIdx.x;
|
||||
out[idx] = tile[threadIdx.x][threadIdx.y];
|
||||
}
|
||||
|
||||
void
|
||||
run_transpose_impl(int rank, int tid, int devid, size_t nitr, size_t nsync)
|
||||
{
|
||||
roctxRangePush("run_transpose_impl");
|
||||
|
||||
constexpr unsigned int M = 4960 * 2;
|
||||
constexpr unsigned int N = 4960 * 2;
|
||||
|
||||
hipStream_t stream = {};
|
||||
|
||||
printf("[transpose] Rank %i, thread %i assigned to device %i\n", rank, tid, devid);
|
||||
HIP_API_CALL(hipSetDevice(devid));
|
||||
HIP_API_CALL(hipStreamCreate(&stream));
|
||||
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
_lk.unlock();
|
||||
|
||||
std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)};
|
||||
std::uniform_int_distribution<int> _dist{0, 1000};
|
||||
|
||||
size_t size = sizeof(int) * M * N;
|
||||
int* inp_matrix = new int[size];
|
||||
int* out_matrix = new int[size];
|
||||
for(size_t i = 0; i < M * N; i++)
|
||||
{
|
||||
inp_matrix[i] = _dist(_engine);
|
||||
out_matrix[i] = 0;
|
||||
}
|
||||
int* in = nullptr;
|
||||
int* out = nullptr;
|
||||
|
||||
HIP_API_CALL(hipMalloc(&in, size));
|
||||
HIP_API_CALL(hipMalloc(&out, size));
|
||||
HIP_API_CALL(hipMemsetAsync(in, 0, size, stream));
|
||||
HIP_API_CALL(hipMemsetAsync(out, 0, size, stream));
|
||||
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream));
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
|
||||
dim3 grid(M / 32, N / 32, 1);
|
||||
dim3 block(32, 32, 1); // transpose
|
||||
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
for(size_t i = 0; i < nitr; ++i)
|
||||
{
|
||||
transpose<<<grid, block, 0, stream>>>(in, out, M, N);
|
||||
check_hip_error();
|
||||
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
}
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream));
|
||||
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
|
||||
float GB = (float) size * nitr * 2 / (1 << 30);
|
||||
|
||||
print_lock.lock();
|
||||
std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time
|
||||
<< " sec\n";
|
||||
std::cout << "[transpose][" << rank << "][" << tid
|
||||
<< "] The average performance of transpose is " << GB / time << " GBytes/sec"
|
||||
<< std::endl;
|
||||
print_lock.unlock();
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
HIP_API_CALL(hipStreamDestroy(stream));
|
||||
|
||||
// cpu_transpose(matrix, out_matrix, M, N);
|
||||
verify(inp_matrix, out_matrix, M, N);
|
||||
|
||||
HIP_API_CALL(hipFree(in));
|
||||
HIP_API_CALL(hipFree(out));
|
||||
|
||||
delete[] inp_matrix;
|
||||
delete[] out_matrix;
|
||||
|
||||
roctxRangePop();
|
||||
}
|
||||
|
||||
void
|
||||
check_hip_error(void)
|
||||
{
|
||||
hipError_t err = hipGetLastError();
|
||||
if(err != hipSuccess)
|
||||
{
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
|
||||
throw std::runtime_error("hip_api_call");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
verify(int* in, int* out, int M, int N)
|
||||
{
|
||||
for(int i = 0; i < 10; i++)
|
||||
{
|
||||
int row = rand() % M;
|
||||
int col = rand() % N;
|
||||
if(in[row * N + col] != out[col * M + row])
|
||||
{
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << "mismatch: " << row << ", " << col << " : " << in[row * N + col] << " | "
|
||||
<< out[col * M + row] << "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
run_transpose(size_t nthreads, size_t nitr, size_t nsync)
|
||||
{
|
||||
auto range_id = roctxRangeStart("run_transpose");
|
||||
|
||||
int rank = 0;
|
||||
int size = 1;
|
||||
|
||||
printf("[transpose] Number of threads: %zu\n", nthreads);
|
||||
printf("[transpose] Number of iterations: %zu\n", nitr);
|
||||
printf("[transpose] Syncing every %zu iterations\n", nsync);
|
||||
|
||||
#if defined(USE_MPI)
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||
#else
|
||||
(void) size;
|
||||
#endif
|
||||
// this is a temporary workaround in omnitrace when HIP + MPI is enabled
|
||||
int ndevice = 0;
|
||||
HIP_API_CALL(hipGetDeviceCount(&ndevice));
|
||||
printf("[transpose] Number of devices found: %i\n", ndevice);
|
||||
auto devids = std::vector<int>{};
|
||||
devids.resize(size * nthreads, 0);
|
||||
int devid = 0;
|
||||
for(size_t i = 0; i < nthreads; ++i)
|
||||
{
|
||||
for(int j = 0; j < size; ++j)
|
||||
{
|
||||
auto idx = (j * nthreads) + i;
|
||||
devids.at(idx) = devid++ % ndevice;
|
||||
}
|
||||
}
|
||||
auto devid_offset = (rank * nthreads);
|
||||
auto _threads = std::vector<std::thread>{};
|
||||
for(size_t i = 1; i < nthreads; ++i)
|
||||
_threads.emplace_back(
|
||||
run_transpose_impl, rank, i, devids.at(devid_offset + i), nitr, nsync);
|
||||
run_transpose_impl(rank, 0, devids.at(devid_offset + 0), nitr, nsync);
|
||||
for(auto& itr : _threads)
|
||||
itr.join();
|
||||
|
||||
#if defined(USE_MPI)
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
// for(int i = 0; i < ndevice; ++i)
|
||||
// {
|
||||
// HIP_API_CALL(hipSetDevice(i));
|
||||
// HIP_API_CALL(hipDeviceSynchronize());
|
||||
// }
|
||||
|
||||
// #if defined(USE_MPI)
|
||||
// MPI_Barrier(MPI_COMM_WORLD);
|
||||
// #endif
|
||||
|
||||
// if(rank == 0)
|
||||
// {
|
||||
// for(int i = 0; i < ndevice; ++i)
|
||||
// {
|
||||
// HIP_API_CALL(hipSetDevice(i));
|
||||
// HIP_API_CALL(hipDeviceReset());
|
||||
// }
|
||||
// }
|
||||
|
||||
// #if defined(USE_MPI)
|
||||
// MPI_Barrier(MPI_COMM_WORLD);
|
||||
// #endif
|
||||
|
||||
roctxRangeStop(range_id);
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
void
|
||||
run_transpose(size_t nthreads, size_t nitr, size_t nsync);
|
||||
@@ -0,0 +1,49 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-tests-lib-vector-operations LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
|
||||
set_source_files_properties(vector-ops.cpp PROPERTIES LANGUAGE HIP)
|
||||
add_library(vector-ops-shared-library SHARED)
|
||||
target_sources(vector-ops-shared-library PRIVATE vector-ops.cpp)
|
||||
target_compile_options(vector-ops-shared-library PRIVATE -W -Wall -Wextra -Wpedantic
|
||||
-Wshadow -Werror)
|
||||
target_include_directories(vector-ops-shared-library PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
set_target_properties(vector-ops-shared-library PROPERTIES OUTPUT_NAME vector-ops)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(vector-ops-shared-library PRIVATE Threads::Threads)
|
||||
|
||||
install(
|
||||
TARGETS vector-ops-shared-library
|
||||
DESTINATION lib
|
||||
COMPONENT tests)
|
||||
@@ -0,0 +1,291 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include <assert.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
#define HIP_API_CALL(CALL) \
|
||||
{ \
|
||||
hipError_t error_ = (CALL); \
|
||||
if(error_ != hipSuccess) \
|
||||
{ \
|
||||
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
|
||||
fprintf(stderr, \
|
||||
"%s:%d :: HIP error : %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
hipGetErrorString(error_)); \
|
||||
throw std::runtime_error("hip_api_call"); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
|
||||
constexpr auto WIDTH = (1 << 12); // 4096
|
||||
constexpr auto HEIGHT = (1 << 11); // 2048
|
||||
constexpr auto DEPTH = (1 << 0); // 1
|
||||
constexpr auto NUM = (WIDTH * HEIGHT * DEPTH);
|
||||
|
||||
struct dimensions
|
||||
{
|
||||
int x = 1;
|
||||
int y = 1;
|
||||
int z = 1;
|
||||
};
|
||||
|
||||
constexpr auto threads_per_block = dimensions{64, 1, 1};
|
||||
|
||||
// Computes vectorAdd with matrix-multiply
|
||||
template <typename Tp>
|
||||
__global__ void
|
||||
addition_kernel(Tp* __restrict__ a,
|
||||
const Tp* __restrict__ b,
|
||||
const Tp* __restrict__ c,
|
||||
int width,
|
||||
int /*height*/)
|
||||
{
|
||||
// printf("addition kernel\n");
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if(x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = b[index] + c[index];
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
__global__ void
|
||||
subtract_kernel(Tp* __restrict__ a,
|
||||
const Tp* __restrict__ b,
|
||||
const Tp* __restrict__ c,
|
||||
int width,
|
||||
int /*height*/)
|
||||
{
|
||||
// printf("subtract kernel\n");
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if(x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = abs(b[index] - c[index]);
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
__global__ void
|
||||
multiply_kernel(Tp* __restrict__ a,
|
||||
const Tp* __restrict__ b,
|
||||
const Tp* __restrict__ c,
|
||||
int width,
|
||||
int /*height*/)
|
||||
{
|
||||
// printf("multiply kernel\n");
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if(x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = (b[index] - 1) * (c[index] - 1) + 1;
|
||||
}
|
||||
|
||||
template <typename Tp>
|
||||
__global__ void
|
||||
divide_kernel(Tp* __restrict__ a,
|
||||
const Tp* __restrict__ b,
|
||||
const Tp* __restrict__ c,
|
||||
int width,
|
||||
int /*height*/)
|
||||
{
|
||||
// printf("divide kernel\n");
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if(x >= WIDTH || y >= HEIGHT) return;
|
||||
int index = y * width + x;
|
||||
|
||||
a[index] = (b[index] - c[index]) / abs(c[index] + b[index]) + 1;
|
||||
}
|
||||
|
||||
void
|
||||
run_vector_ops_impl(int num_queue, int device_id)
|
||||
{
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
HIP_API_CALL(hipSetDevice(device_id));
|
||||
|
||||
std::vector<float*> hostA(num_queue);
|
||||
std::vector<float*> hostB(num_queue);
|
||||
std::vector<float*> hostC(num_queue);
|
||||
|
||||
std::vector<float*> deviceA(num_queue);
|
||||
std::vector<float*> deviceB(num_queue);
|
||||
std::vector<float*> deviceC(num_queue);
|
||||
|
||||
std::vector<hipStream_t> streams(num_queue);
|
||||
|
||||
auto sync_stream = [num_queue, streams](int q) {
|
||||
if(q < 0 || q >= num_queue)
|
||||
throw std::runtime_error{std::string{"invalid stream id: "} + std::to_string(q)};
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(streams.at(q)));
|
||||
};
|
||||
|
||||
auto sync_streams = [num_queue, sync_stream]() {
|
||||
for(int i = 0; i < num_queue; ++i)
|
||||
sync_stream(i);
|
||||
};
|
||||
|
||||
for(int q = 0; q < num_queue; q++)
|
||||
{
|
||||
HIP_API_CALL(hipStreamCreateWithFlags(&streams[q], hipStreamNonBlocking));
|
||||
|
||||
HIP_API_CALL(hipHostMalloc(&hostA[q], NUM * sizeof(float), 0));
|
||||
HIP_API_CALL(hipHostMalloc(&hostB[q], NUM * sizeof(float), 0));
|
||||
HIP_API_CALL(hipHostMalloc(&hostC[q], NUM * sizeof(float), 0));
|
||||
|
||||
// initialize the input data
|
||||
for(int i = 0; i < NUM; i++)
|
||||
{
|
||||
hostB[q][i] = static_cast<float>(i);
|
||||
hostC[q][i] = static_cast<float>(i * 100.0f);
|
||||
}
|
||||
|
||||
HIP_API_CALL(hipMallocAsync(&deviceA[q], NUM * sizeof(float), streams[q]));
|
||||
HIP_API_CALL(hipMallocAsync(&deviceB[q], NUM * sizeof(float), streams[q]));
|
||||
HIP_API_CALL(hipMallocAsync(&deviceC[q], NUM * sizeof(float), streams[q]));
|
||||
|
||||
HIP_API_CALL(hipMemcpyAsync(
|
||||
deviceB[q], hostB[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
|
||||
HIP_API_CALL(hipMemcpyAsync(
|
||||
deviceC[q], hostC[q], NUM * sizeof(float), hipMemcpyHostToDevice, streams[q]));
|
||||
}
|
||||
|
||||
sync_streams();
|
||||
|
||||
for(int q = 0; q < num_queue; q++)
|
||||
{
|
||||
hipLaunchKernelGGL(addition_kernel,
|
||||
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
|
||||
dim3(threads_per_block.x, threads_per_block.y),
|
||||
0,
|
||||
streams[q],
|
||||
deviceA[q],
|
||||
deviceB[q],
|
||||
deviceC[q],
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
hipLaunchKernelGGL(subtract_kernel,
|
||||
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
|
||||
dim3(threads_per_block.x, threads_per_block.y),
|
||||
0,
|
||||
streams[q],
|
||||
deviceA[q],
|
||||
deviceB[q],
|
||||
deviceC[q],
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
hipLaunchKernelGGL(multiply_kernel,
|
||||
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
|
||||
dim3(threads_per_block.x, threads_per_block.y),
|
||||
0,
|
||||
streams[q],
|
||||
deviceA[q],
|
||||
deviceB[q],
|
||||
deviceC[q],
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
hipLaunchKernelGGL(divide_kernel,
|
||||
dim3(WIDTH / threads_per_block.x, HEIGHT / threads_per_block.y),
|
||||
dim3(threads_per_block.x, threads_per_block.y),
|
||||
0,
|
||||
streams[q],
|
||||
deviceB[q],
|
||||
deviceA[q],
|
||||
deviceC[q],
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
}
|
||||
|
||||
sync_streams();
|
||||
|
||||
for(int q = 0; q < num_queue; q++)
|
||||
{
|
||||
HIP_API_CALL(hipMemcpyAsync(
|
||||
hostA[q], deviceA[q], NUM * sizeof(float), hipMemcpyDeviceToHost, streams[q]));
|
||||
|
||||
sync_stream(q);
|
||||
|
||||
HIP_API_CALL(hipFree(deviceA[q]));
|
||||
HIP_API_CALL(hipFree(deviceB[q]));
|
||||
HIP_API_CALL(hipFree(deviceC[q]));
|
||||
|
||||
HIP_API_CALL(hipHostFree(hostA[q]));
|
||||
HIP_API_CALL(hipHostFree(hostB[q]));
|
||||
HIP_API_CALL(hipHostFree(hostC[q]));
|
||||
|
||||
HIP_API_CALL(hipStreamDestroy(streams[q]));
|
||||
}
|
||||
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
double time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
|
||||
|
||||
print_lock.lock();
|
||||
std::cout << "[vector-ops] Runtime of vector-ops is " << time << " sec\n";
|
||||
print_lock.unlock();
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
run_vector_ops(int num_threads, int num_queue)
|
||||
{
|
||||
int device_count = 0;
|
||||
HIP_API_CALL(hipGetDeviceCount(&device_count));
|
||||
|
||||
if(device_count == 0) throw std::runtime_error{"No HIP devices found"};
|
||||
|
||||
num_threads = std::max<int>(num_threads, 1);
|
||||
num_queue = std::max<int>(num_queue, 1);
|
||||
|
||||
auto _threads = std::vector<std::thread>{};
|
||||
_threads.reserve(num_threads);
|
||||
|
||||
for(int i = 0; i < num_threads; ++i)
|
||||
_threads.emplace_back(run_vector_ops_impl, num_queue, i % device_count);
|
||||
|
||||
for(auto& itr : _threads)
|
||||
itr.join();
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
void
|
||||
run_vector_ops(int num_threads, int num_queue);
|
||||
@@ -24,5 +24,6 @@ enable_testing()
|
||||
include(CTest)
|
||||
|
||||
add_subdirectory(tracing)
|
||||
add_subdirectory(counter-collection)
|
||||
add_subdirectory(tracing-plus-cc)
|
||||
add_subdirectory(tracing-hip-in-libraries)
|
||||
add_subdirectory(counter-collection)
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
#
|
||||
# Various counter collection tests
|
||||
#
|
||||
|
||||
foreach(FILENAME conftest.py pytest.ini)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY)
|
||||
|
||||
@@ -40,7 +40,7 @@ set(cc-env-pmc1
|
||||
set_tests_properties(
|
||||
rocprofv3-test-counter-collection-pmc1-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-pmc1}"
|
||||
FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
add_test(NAME rocprofv3-test-counter-collection-pmc1-validate
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
|
||||
@@ -50,4 +50,4 @@ set_tests_properties(
|
||||
rocprofv3-test-counter-collection-pmc1-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
rocprofv3-test-counter-collection-pmc1-execute FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -1,30 +1,46 @@
|
||||
import pandas as pd
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
kernel_list = ["addition_kernel", "subtract_kernel", "multiply_kernel", "divide_kernel"]
|
||||
kernel_list = sorted(
|
||||
["addition_kernel", "subtract_kernel", "multiply_kernel", "divide_kernel"]
|
||||
)
|
||||
|
||||
|
||||
def unique(lst):
|
||||
return list(set(lst))
|
||||
|
||||
|
||||
def test_validate_counter_collection_pmc1(input_data: pd.DataFrame):
|
||||
df = input_data
|
||||
|
||||
assert df.empty == False
|
||||
assert not df.empty
|
||||
assert (df["Agent_Id"].astype(int).values > 0).all()
|
||||
assert (df["Queue_Id"].astype(int).values > 0).all()
|
||||
assert (df["Process_Id"].astype(int).values > 0).all()
|
||||
assert len(df["Kernel-Name"]) > 0
|
||||
df_list = df["Kernel-Name"].values.flatten().tolist()
|
||||
# Check if each string in kernel_list is present at least once
|
||||
missing_kernels = []
|
||||
for kernel in kernel_list:
|
||||
if kernel not in df_list:
|
||||
missing_kernels.append(kernel)
|
||||
assert len(df["Kernel_Name"]) > 0
|
||||
|
||||
assert kernel_list == sorted(df["Kernel_Name"].unique().tolist())
|
||||
|
||||
kernel_count = dict([[itr, 0] for itr in kernel_list])
|
||||
assert len(kernel_count) == len(kernel_list)
|
||||
for itr in df["Kernel_Name"]:
|
||||
kernel_count[itr] += 1
|
||||
kn_cnt = [itr for _, itr in kernel_count.items()]
|
||||
assert min(kn_cnt) == max(kn_cnt) and len(unique(kn_cnt)) == 1
|
||||
|
||||
assert (
|
||||
not missing_kernels
|
||||
), f"The following kernel names are missing from the out file: {missing_kernels}"
|
||||
assert df["Counter_Name"].str.contains("SQ_WAVES").all()
|
||||
assert len(df["Counter_Value"]) > 0
|
||||
assert df["Counter_Name"].str.contains("SQ_WAVES").all()
|
||||
assert (df["Counter_Value"].astype(int).values > 0).all()
|
||||
|
||||
di_list = df["Dispatch_Id"].astype(int).values.tolist()
|
||||
di_uniq = sorted(df["Dispatch_Id"].unique().tolist())
|
||||
# make sure the dispatch ids are unique and ordered
|
||||
di_expect = [idx + 1 for idx in range(len(di_list))]
|
||||
assert di_expect == di_uniq
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -40,7 +40,7 @@ set(cc-env-pmc2
|
||||
set_tests_properties(
|
||||
rocprofv3-test-counter-collection-pmc2-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-pmc2}"
|
||||
FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
add_test(NAME rocprofv3-test-counter-collection-pmc2-validate
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py
|
||||
@@ -50,4 +50,4 @@ set_tests_properties(
|
||||
rocprofv3-test-counter-collection-pmc2-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
rocprofv3-test-counter-collection-pmc2-execute FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -34,7 +34,7 @@ def test_validate_counter_collection_pmc2(input_dir: pd.DataFrame):
|
||||
with open(file_path, "r") as file:
|
||||
df = pd.read_csv(file)
|
||||
# check if kernel-name is present
|
||||
assert len(df["Kernel-Name"]) > 0
|
||||
assert len(df["Kernel_Name"]) > 0
|
||||
# check if counter value is positive
|
||||
assert len(df["Counter_Value"]) > 0
|
||||
|
||||
|
||||
+2
-2
@@ -41,7 +41,7 @@ set(cc-env-list-metrics
|
||||
set_tests_properties(
|
||||
rocprofv3-test-list-metrics-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-env-list-metrics}"
|
||||
FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-list-metrics-std-out-execute
|
||||
@@ -76,6 +76,6 @@ set_tests_properties(
|
||||
DEPENDS
|
||||
rocprofv3-test-list-metrics-execute
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception"
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
ATTACHED_FILES_ON_FAIL
|
||||
"${VALIDATION_FILES}")
|
||||
|
||||
@@ -3,3 +3,4 @@
|
||||
addopts = --durations=20 -rA -s -vv
|
||||
testpaths = input1/validate.py
|
||||
input2/validate.py
|
||||
list_metrics/validate.py
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
#
|
||||
# rocprofv3 tool test
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(
|
||||
rocprofiler-tests-rocprofv3-tracing-hip-in-libraries
|
||||
LANGUAGES CXX
|
||||
VERSION 0.0.0)
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-trace-hip-in-libraries-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --hip-runtime-trace
|
||||
--hip-compiler-trace --hsa-core-trace --hsa-amd-trace --hsa-image-trace
|
||||
--hsa-finalizer-trace --kernel-trace --memory-copy-trace -d
|
||||
${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace -o out $<TARGET_FILE:hip-in-libraries>)
|
||||
|
||||
string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
|
||||
|
||||
set(tracing-env "${PRELOAD_ENV}"
|
||||
"HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>")
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-trace-hip-in-libraries-execute
|
||||
PROPERTIES
|
||||
LABELS
|
||||
"integration-tests"
|
||||
ENVIRONMENT
|
||||
"${tracing-env}"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"HSA_CORE_API|HSA_AMD_EXT_API|HSA_IMAGE_EXT_API|HSA_FINALIZER_EXT_API|HIP_API|HIP_COMPILER_API|KERNEL_DISPATCH|CODE_OBJECT"
|
||||
)
|
||||
|
||||
foreach(FILENAME validate.py conftest.py)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY)
|
||||
endforeach()
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-trace-hip-in-libraries-validate
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --hsa-input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hsa_api_trace.csv
|
||||
--hip-input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hip_api_trace.csv
|
||||
--kernel-input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_kernel_trace.csv
|
||||
--memory-copy-input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_memory_copy_trace.csv)
|
||||
|
||||
set(VALIDATION_FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_memory_copy_trace.csv
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hsa_api_trace.csv
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_hip_api_trace.csv
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-in-libraries-trace/out_kernel_trace.csv)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-trace-hip-in-libraries-validate
|
||||
PROPERTIES TIMEOUT
|
||||
45
|
||||
LABELS
|
||||
"integration-tests"
|
||||
DEPENDS
|
||||
rocprofv3-test-trace-hip-in-libraries-execute
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"AssertionError"
|
||||
ATTACHED_FILES_ON_FAIL
|
||||
"${VALIDATION_FILES}")
|
||||
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import csv
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--hsa-input",
|
||||
action="store",
|
||||
help="Path to HSA API tracing CSV file.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--kernel-input",
|
||||
action="store",
|
||||
help="Path to kernel tracing CSV file.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--memory-copy-input",
|
||||
action="store",
|
||||
help="Path to memory-copy tracing CSV file.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--marker-input",
|
||||
action="store",
|
||||
help="Path to marker API tracing CSV file.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--hip-input",
|
||||
action="store",
|
||||
help="Path to HIP runtime and compiler API tracing CSV file.",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def hsa_input_data(request):
|
||||
filename = request.config.getoption("--hsa-input")
|
||||
data = []
|
||||
with open(filename, "r") as inp:
|
||||
reader = csv.DictReader(inp)
|
||||
for row in reader:
|
||||
data.append(row)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def kernel_input_data(request):
|
||||
filename = request.config.getoption("--kernel-input")
|
||||
data = []
|
||||
with open(filename, "r") as inp:
|
||||
reader = csv.DictReader(inp)
|
||||
for row in reader:
|
||||
data.append(row)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def memory_copy_input_data(request):
|
||||
filename = request.config.getoption("--memory-copy-input")
|
||||
data = []
|
||||
with open(filename, "r") as inp:
|
||||
reader = csv.DictReader(inp)
|
||||
for row in reader:
|
||||
data.append(row)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def marker_input_data(request):
|
||||
filename = request.config.getoption("--marker-input")
|
||||
data = []
|
||||
with open(filename, "r") as inp:
|
||||
reader = csv.DictReader(inp)
|
||||
for row in reader:
|
||||
data.append(row)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def hip_input_data(request):
|
||||
filename = request.config.getoption("--hip-input")
|
||||
data = []
|
||||
if os.path.exists(filename):
|
||||
with open(filename, "r") as inp:
|
||||
reader = csv.DictReader(inp)
|
||||
for row in reader:
|
||||
data.append(row)
|
||||
|
||||
return data
|
||||
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
|
||||
class dim3(object):
|
||||
def __init__(self, x, y, z):
|
||||
self.x = int(x)
|
||||
self.y = int(y)
|
||||
self.z = int(z)
|
||||
|
||||
def as_tuple(self):
|
||||
return (self.x, self.y, self.z)
|
||||
|
||||
|
||||
def test_api_trace(hsa_input_data, hip_input_data):
|
||||
functions = []
|
||||
correlation_ids = []
|
||||
for row in hsa_input_data:
|
||||
assert row["Domain"] in (
|
||||
"HSA_CORE_API",
|
||||
"HSA_AMD_EXT_API",
|
||||
"HSA_IMAGE_EXT_API",
|
||||
"HSA_FINALIZE_EXT_API",
|
||||
)
|
||||
assert int(row["Process_Id"]) > 0
|
||||
assert int(row["Thread_Id"]) >= int(row["Process_Id"])
|
||||
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
|
||||
functions.append(row["Function"])
|
||||
correlation_ids.append(int(row["Correlation_Id"]))
|
||||
|
||||
for row in hip_input_data:
|
||||
assert row["Domain"] in [
|
||||
"HIP_RUNTIME_API",
|
||||
"HIP_COMPILER_API",
|
||||
]
|
||||
assert int(row["Process_Id"]) > 0
|
||||
assert int(row["Thread_Id"]) == 0 or int(row["Thread_Id"]) >= int(
|
||||
row["Process_Id"]
|
||||
)
|
||||
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
|
||||
functions.append(row["Function"])
|
||||
correlation_ids.append(int(row["Correlation_Id"]))
|
||||
|
||||
correlation_ids = sorted(list(set(correlation_ids)))
|
||||
|
||||
# all correlation ids are unique
|
||||
assert len(correlation_ids) == (len(hsa_input_data) + len(hip_input_data))
|
||||
# correlation ids are numbered from 1 to N
|
||||
assert correlation_ids[0] == 1
|
||||
assert correlation_ids[-1] == len(correlation_ids)
|
||||
|
||||
functions = list(set(functions))
|
||||
for itr in (
|
||||
"hsa_amd_memory_async_copy_on_engine",
|
||||
"hsa_agent_get_info",
|
||||
"hsa_agent_iterate_isas",
|
||||
"hsa_signal_create",
|
||||
"hsa_agent_get_info",
|
||||
"hsa_executable_symbol_get_info",
|
||||
):
|
||||
assert itr in functions
|
||||
if hip_input_data:
|
||||
for itr in (
|
||||
"hipGetLastError",
|
||||
"hipLaunchKernel",
|
||||
"hipStreamSynchronize",
|
||||
"hipMemcpyAsync",
|
||||
"hipFree",
|
||||
"hipStreamDestroy",
|
||||
"hipDeviceSynchronize",
|
||||
"hipDeviceReset",
|
||||
"hipSetDevice",
|
||||
):
|
||||
assert itr in functions
|
||||
|
||||
|
||||
def test_kernel_trace(kernel_input_data):
|
||||
valid_kernel_names = sorted(
|
||||
[
|
||||
"__amd_rocclr_fillBufferAligned",
|
||||
"(anonymous namespace)::transpose(int const*, int*, int, int)",
|
||||
"void (anonymous namespace)::addition_kernel<float>(float*, float const*, float const*, int, int)",
|
||||
"void (anonymous namespace)::divide_kernel<float>(float*, float const*, float const*, int, int)",
|
||||
"void (anonymous namespace)::multiply_kernel<float>(float*, float const*, float const*, int, int)",
|
||||
"void (anonymous namespace)::subtract_kernel<float>(float*, float const*, float const*, int, int)",
|
||||
]
|
||||
)
|
||||
|
||||
kernels = []
|
||||
for row in kernel_input_data:
|
||||
kernel_name = row["Kernel_Name"]
|
||||
|
||||
assert row["Kind"] == "KERNEL_DISPATCH"
|
||||
assert int(row["Agent_Id"]) > 0
|
||||
assert int(row["Queue_Id"]) > 0
|
||||
assert int(row["Kernel_Id"]) > 0
|
||||
assert int(row["Correlation_Id"]) > 0
|
||||
assert kernel_name in valid_kernel_names
|
||||
|
||||
if kernel_name not in kernels:
|
||||
kernels.append(kernel_name)
|
||||
|
||||
workgrp_size = dim3(
|
||||
row["Workgroup_Size_X"], row["Workgroup_Size_Y"], row["Workgroup_Size_Z"]
|
||||
)
|
||||
grid_size = dim3(row["Grid_Size_X"], row["Grid_Size_Y"], row["Grid_Size_Z"])
|
||||
|
||||
if kernel_name == "__amd_rocclr_fillBufferAligned":
|
||||
assert workgrp_size.as_tuple() > (1, 1, 1)
|
||||
assert grid_size.as_tuple() > (1, 1, 1)
|
||||
elif "transpose" in kernel_name:
|
||||
assert workgrp_size.as_tuple() == (32, 32, 1)
|
||||
assert grid_size.as_tuple() == (9920, 9920, 1)
|
||||
else:
|
||||
assert workgrp_size.as_tuple() == (64, 1, 1)
|
||||
assert grid_size.as_tuple() == (4096, 2048, 1)
|
||||
|
||||
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
|
||||
|
||||
kernels = sorted(list(set(kernels)))
|
||||
assert kernels == valid_kernel_names
|
||||
|
||||
|
||||
def test_memory_copy_trace(memory_copy_input_data):
|
||||
for row in memory_copy_input_data:
|
||||
assert row["Kind"] == "MEMORY_COPY"
|
||||
assert row["Direction"] in ("HOST_TO_DEVICE", "DEVICE_TO_HOST")
|
||||
if row["Direction"] == "HOST_TO_DEVICE":
|
||||
assert int(row["Source_Agent_Id"]) == 0
|
||||
elif row["Direction"] == "DEVICE_TO_HOST":
|
||||
assert int(row["Destination_Agent_Id"]) == 0
|
||||
assert int(row["Correlation_Id"]) > 0
|
||||
assert int(row["End_Timestamp"]) >= int(row["Start_Timestamp"])
|
||||
|
||||
assert len(memory_copy_input_data) == 120
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
|
||||
sys.exit(exit_code)
|
||||
@@ -40,7 +40,7 @@ set(cc-tracing-env
|
||||
set_tests_properties(
|
||||
rocprofv3-test-tracing-plus-cc-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${cc-tracing-env}"
|
||||
FAIL_REGULAR_EXPRESSION "threw an exception")
|
||||
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
add_test(NAME rocprofv3-test-tracing-plus-cc-validate
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py
|
||||
@@ -50,4 +50,4 @@ set_tests_properties(
|
||||
rocprofv3-test-tracing-plus-cc-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
rocprofv3-test-tracing-plus-cc-execute FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
|
||||
@@ -34,7 +34,7 @@ def test_validate_counter_collection_plus_tracing(input_dir: pd.DataFrame):
|
||||
with open(file_path, "r") as file:
|
||||
df = pd.read_csv(file)
|
||||
# check if either kernel-name/FUNCTION is present
|
||||
assert "Kernel-Name" in df.columns or "Function" in df.columns
|
||||
assert "Kernel_Name" in df.columns or "Function" in df.columns
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -23,7 +23,7 @@ def test_hsa_api_trace(hsa_input_data):
|
||||
correlation_ids = sorted(list(set(correlation_ids)))
|
||||
|
||||
hsa_api_calls_offset = 2 # roctxRangePush is first
|
||||
num_marker_api_calls = 6 # seven marker API calls, only six entries in
|
||||
num_marker_api_calls = 7 # seven marker API calls, only six entries in
|
||||
# marker csv data because roctxRangePush + roctxRangePop is one entry
|
||||
|
||||
# all correlation ids are unique
|
||||
|
||||
@@ -526,19 +526,16 @@ counter_collection_buffered(rocprofiler_context_id_t, /*context*/
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/
|
||||
const rocprofiler_agent_t* agent,
|
||||
rocprofiler_correlation_id_t, /*correlation_id*/
|
||||
const hsa_kernel_dispatch_packet_t*, /*dispatch_packet*/
|
||||
uint64_t, /*kernel_id*/
|
||||
void* /*callback_data_args*/,
|
||||
rocprofiler_profile_config_id_t* config)
|
||||
dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
rocprofiler_user_data_t* /*user_data*/,
|
||||
void* /*callback_data_args*/)
|
||||
{
|
||||
static std::shared_mutex m_mutex = {};
|
||||
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
|
||||
|
||||
auto search_cache = [&]() {
|
||||
if(auto pos = profile_cache.find(agent->id.handle); pos != profile_cache.end())
|
||||
if(auto pos = profile_cache.find(dispatch_data.agent_id.handle); pos != profile_cache.end())
|
||||
{
|
||||
*config = pos->second;
|
||||
return true;
|
||||
@@ -565,7 +562,7 @@ dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/
|
||||
|
||||
// Iterate through the agents and get the counters available on that agent
|
||||
ROCPROFILER_CALL(rocprofiler_iterate_agent_supported_counters(
|
||||
agent->id,
|
||||
dispatch_data.agent_id,
|
||||
[]([[maybe_unused]] rocprofiler_agent_id_t id,
|
||||
rocprofiler_counter_id_t* counters,
|
||||
size_t num_counters,
|
||||
@@ -600,11 +597,12 @@ dispatch_callback(rocprofiler_queue_id_t, /*queue_id*/
|
||||
|
||||
// Create a colleciton profile for the counters
|
||||
rocprofiler_profile_config_id_t profile;
|
||||
ROCPROFILER_CALL(rocprofiler_create_profile_config(
|
||||
agent->id, collect_counters.data(), collect_counters.size(), &profile),
|
||||
"Could not construct profile cfg");
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_create_profile_config(
|
||||
dispatch_data.agent_id, collect_counters.data(), collect_counters.size(), &profile),
|
||||
"Could not construct profile cfg");
|
||||
|
||||
profile_cache.emplace(agent->id.handle, profile);
|
||||
profile_cache.emplace(dispatch_data.agent_id.handle, profile);
|
||||
// Return the profile to collect those counters for this dispatch
|
||||
*config = profile;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user