Adding tests for code coverage (#82)

* Adding tests for code coverage

* integration tests

* Fixing gtest dependencies

* Removing gmock linkage

* build fixes

* Fixing Analyze job

* CMake updates

* Fixing PSDB

* Addressing feedback

* Adding licenses

* Addressing feedback

* updating dashboard

* more API tests

* Addressing feedback

* correcting Macro

* preloading libintercept

[ROCm/aqlprofile commit: eca6739e13]
This commit is contained in:
Bhardwaj, Gopesh
2025-06-12 19:13:44 +05:30
committed by GitHub
parent 26ddb37bd1
commit 4117f4de75
26 changed files with 2086 additions and 5 deletions
+1 -1
View File
@@ -53,7 +53,7 @@ jobs:
run: |
git config --global --add safe.directory '*'
apt-get update
apt-get install -y build-essential cmake g++-11 g++-12 python3-pip libdw-dev rocm-llvm-dev
apt-get install -y build-essential cmake g++-11 g++-12 python3-pip libdw-dev rocm-llvm-dev libgtest-dev libgmock-dev
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 20 --slave /usr/bin/g++ g++ /usr/bin/g++-12 --slave /usr/bin/gcov gcov /usr/bin/gcov-12
@@ -66,7 +66,7 @@ jobs:
run: |
git config --global --add safe.directory '*'
apt-get update
apt-get install -y build-essential cmake g++-11 g++-12 python3-pip
apt-get install -y build-essential cmake g++-11 g++-12 python3-pip libgtest-dev libgmock-dev
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 20 --slave /usr/bin/g++ g++ /usr/bin/g++-12 --slave /usr/bin/gcov gcov /usr/bin/gcov-12
@@ -89,6 +89,7 @@ jobs:
-DCTEST_BINARY_DIRECTORY="$(pwd)/build" -DAQLPROFILE_BUILD_NUM_JOBS="16" -DCTEST_SITE="${RUNNER_HOSTNAME}"
-DCTEST_BUILD_NAME=PR_${{ github.ref_name }}_${{ github.repository }}-${{ matrix.os }}-${{ matrix.runner }}-core
-DCMAKE_CTEST_ARGUMENTS=""
-DAQLPROFILE_BUILD_TESTS=ON
-DAQLPROFILE_EXTRA_CONFIGURE_ARGS=""
-S ./dashboard.cmake
@@ -142,5 +143,6 @@ jobs:
-DCTEST_BINARY_DIRECTORY="$(pwd)/build" -DAQLPROFILE_BUILD_NUM_JOBS="16" -DCTEST_SITE="${RUNNER_HOSTNAME}"
-DCTEST_BUILD_NAME=PR_${{ github.ref_name }}_${{ github.repository }}-${{ matrix.os }}-${{ matrix.runner }}-core
-DCMAKE_CTEST_ARGUMENTS=""
-DAQLPROFILE_BUILD_TESTS=ON
-DAQLPROFILE_EXTRA_CONFIGURE_ARGS=""
-S ./dashboard.cmake
+3
View File
@@ -65,6 +65,9 @@ set ( DEST_NAME ${AQLPROFILE_NAME} )
install ( TARGETS ${AQLPROFILE_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT runtime )
install ( TARGETS ${AQLPROFILE_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan )
# Enable/disable test
option(AQLPROFILE_BUILD_TESTS "Build tests for AQLProfile" OFF)
## Add the packaging directives for the runtime library.
if ( ENABLE_ASAN_PACKAGING )
set ( CPACK_PACKAGE_NAME ${AQLPROFILE_NAME}-asan )
+1
View File
@@ -65,6 +65,7 @@ cmake \
-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \
-DCPACK_GENERATOR="STGZ" \
-DGPU_TARGETS="$GPU_LIST" \
-DAQLPROFILE_BUILD_TESTS=OFF \
-DCPACK_OBJCOPY_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-objcopy" \
-DCPACK_READELF_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-readelf" \
-DCPACK_STRIP_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-strip" \
+1 -1
View File
@@ -42,7 +42,7 @@ if(NOT DEFINED AQLPROFILE_BUILD_NUM_JOBS)
set(AQLPROFILE_BUILD_NUM_JOBS "16")
endif()
set(CTEST_CONFIGURE_COMMAND "cmake -B ${CTEST_BINARY_DIRECTORY} -DCMAKE_BUILD_TYPE='RelWithDebInfo' -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=/opt/rocm -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm -DCPACK_GENERATOR='DEB;RPM;STGZ' -DGPU_TARGETS='gfx906,gfx90a,gfx942,gfx1101,gfx1201' ${AQLPROFILE_EXTRA_CONFIGURE_ARGS} ${CTEST_SOURCE_DIRECTORY}")
set(CTEST_CONFIGURE_COMMAND "cmake -B ${CTEST_BINARY_DIRECTORY} -DCMAKE_BUILD_TYPE='RelWithDebInfo' -DCMAKE_PREFIX_PATH=/opt/rocm -DAQLPROFILE_BUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=/opt/rocm -DCPACK_PACKAGING_INSTALL_PREFIX=/opt/rocm -DCPACK_GENERATOR='DEB;RPM;STGZ' -DGPU_TARGETS='gfx906,gfx90a,gfx942,gfx1101,gfx1201' ${AQLPROFILE_EXTRA_CONFIGURE_ARGS} ${CTEST_SOURCE_DIRECTORY}")
set(CTEST_BUILD_COMMAND "cmake --build \"${CTEST_BINARY_DIRECTORY}\" -- -j ${AQLPROFILE_BUILD_NUM_JOBS} all mytest")
if(NOT DEFINED CTEST_SITE)
+1 -1
View File
@@ -73,4 +73,4 @@ add_custom_target( mygen
COMMAND sh -xc "sed 's/_GPU_BLOCKINFO_H_/SRC_DEF_GPU_BLOCK_INFO_H_/' ${BINFO_DEF} >>${BINFO_HEADER}"
)
add_subdirectory(src/core)
add_subdirectory(src/core)
+5 -1
View File
@@ -1 +1,5 @@
add_subdirectory(include)
add_subdirectory(include)
if(AQLPROFILE_BUILD_TESTS)
enable_testing()
add_subdirectory(tests)
endif()
@@ -23,6 +23,7 @@
#pragma once
#include <cstdint>
#include <climits>
#include <future>
#include <map>
#include <string>
@@ -0,0 +1,119 @@
# write a CMakeLists.txt file for the gfx9_factory_test
# that includes the necessary libraries and sets the properties for the test
include(GoogleTest)
find_package(GTest REQUIRED)
include_directories(${GTEST_INCLUDE_DIRS})
find_library(
hsa-amd-aqlprofile64
REQUIRED
NAMES hsa-amd-aqlprofile64 hsa-amd-aqlprofile
HINTS /opt/rocm/lib /opt/rocm
PATHS /opt/rocm/lib /opt/rocm)
# Add test for memory manager
add_executable(gfx9-memory-manager-test)
SET(AQLPROFILE_MEMORYMANAGER_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/memorymanager_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../memorymanager.cpp
)
target_sources(gfx9-memory-manager-test PRIVATE ${AQLPROFILE_MEMORYMANAGER_SOURCES})
target_include_directories(gfx9-memory-manager-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR})
target_link_libraries(
gfx9-memory-manager-test
PRIVATE
hsa-runtime64::hsa-runtime64
GTest::gtest
GTest::gtest_main)
gtest_add_tests(
TARGET gfx9-memory-manager-test
SOURCES ${AQLPROFILE_MEMORYMANAGER_SOURCES}
TEST_LIST gfx9-memory-manager_TESTS
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set_tests_properties(
${gfx9-memory-manager_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests" FAIL_REGULAR_EXPRESSION
"${AQLPROFILE_DEFAULT_FAIL_REGEX}")
# Add test for aql profile
add_executable(aqlprofile-test)
SET(AQLPROFILE_TEST_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/aql_profile_tests.cpp
)
target_sources(aqlprofile-test PRIVATE ${AQLPROFILE_TEST_SOURCES})
target_include_directories(aqlprofile-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR})
target_link_libraries(
aqlprofile-test
PRIVATE
hsa-runtime64::hsa-runtime64
GTest::gtest
GTest::gtest_main
GTest::gmock
GTest::gmock_main)
gtest_add_tests(
TARGET aqlprofile-test
SOURCES ${AQLPROFILE_TEST_SOURCES}
TEST_LIST aqlprofile-test_TESTS
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set_tests_properties(
${aqlprofile-test_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests" FAIL_REGULAR_EXPRESSION
"${AQLPROFILE_DEFAULT_FAIL_REGEX}")
# Add tests for command buffer
add_executable(command-buffer-test)
SET(AQLPROFILE_COMMAND_BUFFER_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/command_buffer_tests.cpp
)
target_sources(command-buffer-test PRIVATE ${AQLPROFILE_COMMAND_BUFFER_SOURCES})
target_include_directories(command-buffer-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR})
target_link_libraries(
command-buffer-test
PRIVATE
hsa-runtime64::hsa-runtime64
GTest::gtest
GTest::gtest_main
GTest::gmock
GTest::gmock_main)
gtest_add_tests(
TARGET command-buffer-test
SOURCES ${AQLPROFILE_COMMAND_BUFFER_SOURCES}
TEST_LIST command-buffer-test_TESTS
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set_tests_properties(
${command-buffer-test_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests" FAIL_REGULAR_EXPRESSION
"${AQLPROFILE_DEFAULT_FAIL_REGEX}")
# Add tests for counters
add_executable(counters-test)
SET(AQLPROFILE_COUNTERS_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/counter_tests.cpp
)
target_sources(counters-test PRIVATE ${AQLPROFILE_COUNTERS_SOURCES})
target_include_directories(counters-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR})
target_link_libraries(
counters-test
PRIVATE
hsa-runtime64::hsa-runtime64
${hsa-amd-aqlprofile64}
GTest::gtest
GTest::gtest_main
GTest::gmock
GTest::gmock_main)
gtest_add_tests(
TARGET counters-test
SOURCES ${AQLPROFILE_COUNTERS_SOURCES}
TEST_LIST counters-test_TESTS
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set_tests_properties(
${counters-test_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests" FAIL_REGULAR_EXPRESSION
"${AQLPROFILE_DEFAULT_FAIL_REGEX}")
@@ -0,0 +1,245 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include "core/aql_profile.hpp"
#include "core/pm4_factory.h"
// header for memcpy
#include <cstring>
//#include "core/counter_dimensions.hpp"
//#include "core/include/aql_profile_v2.h"
using namespace aql_profile;
using namespace testing;
namespace aql_profile {
bool Pm4Factory::concurrent_create_mode_ = false;
bool Pm4Factory::spm_kfd_mode_ = false;
Pm4Factory::mutex_t Pm4Factory::mutex_;
Pm4Factory::instances_t* Pm4Factory::instances_ = nullptr;
}
// Mock classes to simulate Pm4Factory and related functionality
class MockPm4Factory : public Pm4Factory {
public:
MockPm4Factory() : Pm4Factory(BlockInfoMap(nullptr, 0)) {}
MOCK_METHOD(const GpuBlockInfo*, GetBlockInfo, (const hsa_ven_amd_aqlprofile_event_t*), (const));
MOCK_METHOD(bool, IsGFX9, (), (const));
MOCK_METHOD(bool, IsConcurrent, (), (const));
MOCK_METHOD(int, GetNumWGPs, (), (const));
MOCK_METHOD(bool, SpmKfdMode, (), (const));
MOCK_METHOD(bool, SPISkip, (uint32_t, uint32_t), (const));
};
// Helper to create a mock GpuBlockInfo
GpuBlockInfo* CreateBlockInfo(uint32_t id, uint32_t counter_count, uint32_t attr = 0) {
auto* info = new GpuBlockInfo();
info->id = id;
info->counter_count = counter_count;
info->attr = attr;
info->instance_count = 1;
return info;
}
// Helper to create a profile with specified events
hsa_ven_amd_aqlprofile_profile_t* CreateProfile(const std::vector<hsa_ven_amd_aqlprofile_event_t>& events) {
auto* profile = new hsa_ven_amd_aqlprofile_profile_t();
profile->event_count = events.size();
if (!events.empty()) {
memcpy(reinterpret_cast<void*>(&profile->events), &events, sizeof(hsa_ven_amd_aqlprofile_event_t));
} else {
profile->events = nullptr;
}
return profile;
}
void DeleteProfile(hsa_ven_amd_aqlprofile_profile_t* profile) {
if (profile) {
delete[] profile->events;
delete profile;
}
}
hsa_status_t DefaultTracedataCallback(hsa_ven_amd_aqlprofile_info_type_t info_type,
hsa_ven_amd_aqlprofile_info_data_t* info_data,
void* callback_data) {
hsa_status_t status = HSA_STATUS_SUCCESS;
hsa_ven_amd_aqlprofile_info_data_t* passed_data =
reinterpret_cast<hsa_ven_amd_aqlprofile_info_data_t*>(callback_data);
if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) {
if (info_data->sample_id == passed_data->sample_id) {
passed_data->trace_data = info_data->trace_data;
status = HSA_STATUS_INFO_BREAK;
}
}
return status;
}
// Test fixture for CountersVec tests
class CountersVecTest : public Test {
protected:
void SetUp() override {
pm4_factory = new NiceMock<MockPm4Factory>();
ON_CALL(*pm4_factory, IsGFX9()).WillByDefault(Return(true));
}
void TearDown() override {
delete pm4_factory;
}
NiceMock<MockPm4Factory>* pm4_factory;
pm4_builder::counters_vector CountersVec(const profile_t* profile, const Pm4Factory* pm4_factory);
};
pm4_builder::counters_vector CountersVecTest::CountersVec(const profile_t* profile,
const Pm4Factory* pm4_factory) {
pm4_builder::counters_vector vec;
std::map<block_des_t, uint32_t, lt_block_des> index_map;
for (const hsa_ven_amd_aqlprofile_event_t* p = profile->events;
p < profile->events + profile->event_count; ++p) {
const GpuBlockInfo* block_info = pm4_factory->GetBlockInfo(p);
const block_des_t block_des = {pm4_factory->GetBlockInfo(p)->id, p->block_index};
// Counting counter register index per block
const auto ret = index_map.insert({block_des, 0});
uint32_t& reg_index = ret.first->second;
if (pm4_builder::SPISkip(block_info->attr, p->counter_id))
{
vec.push_back({p->counter_id, reg_index, block_des, block_info});
continue;
}
if (reg_index >= block_info->counter_count) {
throw event_exception("Event is out of block counter registers number limit, ", *p);
}
vec.push_back({p->counter_id, reg_index, block_des, block_info});
++reg_index;
}
return vec;
}
//Test case: Empty profile (no events)
TEST_F(CountersVecTest, EmptyProfile) {
auto profile = CreateProfile({});
auto counters = CountersVec(profile, pm4_factory);
EXPECT_TRUE(counters.empty());
DeleteProfile(profile);
}
// Test case: Profile with regular events
TEST_F(CountersVecTest, RegularEvents) {
GpuBlockInfo* block_info1 = CreateBlockInfo(1, 4);
GpuBlockInfo* block_info2 = CreateBlockInfo(2, 2);
std::vector<hsa_ven_amd_aqlprofile_event_t> events = {
{HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 4}
};
auto profile = CreateProfile(events);
EXPECT_NE(profile,nullptr);
EXPECT_EQ(profile->events->block_index, 0);
EXPECT_EQ(profile->events->counter_id, 4);
pm4_factory->GetBlockInfo(profile->events);
bool is_gfx9 = pm4_factory->IsGFX9();
EXPECT_TRUE(is_gfx9);
}
// Test fixture for the DefaultTracedataCallback function
class DefaultTracedataCallbackTest : public Test {
protected:
hsa_ven_amd_aqlprofile_info_data_t CreateInfoData(uint32_t sample_id) {
hsa_ven_amd_aqlprofile_info_data_t data{};
data.sample_id = sample_id;
data.trace_data.ptr = reinterpret_cast<void*>(0x1000 + sample_id);
data.trace_data.size = 0x100 + sample_id;
return data;
}
};
//Test case: DefaultTracedataCallback with matching sample IDs
TEST_F(DefaultTracedataCallbackTest, MatchingSampleId) {
auto info_data = CreateInfoData(42);
hsa_ven_amd_aqlprofile_info_data_t callback_data{};
callback_data.sample_id = 42;
callback_data.trace_data.ptr = nullptr;
callback_data.trace_data.size = 0;
hsa_status_t status = DefaultTracedataCallback(
HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA,
&info_data,
&callback_data);
EXPECT_EQ(status, HSA_STATUS_INFO_BREAK);
EXPECT_EQ(callback_data.trace_data.ptr, info_data.trace_data.ptr);
EXPECT_EQ(callback_data.trace_data.size, info_data.trace_data.size);
}
// Test case: DefaultTracedataCallback with non-matching sample IDs
TEST_F(DefaultTracedataCallbackTest, NonMatchingSampleId) {
auto info_data = CreateInfoData(42);
hsa_ven_amd_aqlprofile_info_data_t callback_data{};
callback_data.sample_id = 24;
void* original_ptr = nullptr;
size_t original_size = 0;
callback_data.trace_data.ptr = original_ptr;
callback_data.trace_data.size = original_size;
hsa_status_t status = DefaultTracedataCallback(
HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA,
&info_data,
&callback_data);
EXPECT_EQ(status, HSA_STATUS_SUCCESS);
EXPECT_EQ(callback_data.trace_data.ptr, original_ptr);
EXPECT_EQ(callback_data.trace_data.size, original_size);
}
// Test case: DefaultTracedataCallback with non-trace info type
TEST_F(DefaultTracedataCallbackTest, NonTraceInfoType) {
auto info_data = CreateInfoData(42);
hsa_ven_amd_aqlprofile_info_data_t callback_data{};
callback_data.sample_id = 42;
void* original_ptr = nullptr;
size_t original_size = 0;
callback_data.trace_data.ptr = original_ptr;
callback_data.trace_data.size = original_size;
hsa_status_t status = DefaultTracedataCallback(
HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA,
&info_data,
&callback_data);
EXPECT_EQ(status, HSA_STATUS_SUCCESS);
EXPECT_EQ(callback_data.trace_data.ptr, original_ptr);
EXPECT_EQ(callback_data.trace_data.size, original_size);
}
@@ -0,0 +1,51 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <gtest/gtest.h>
#include "core/commandbuffermgr.hpp"
#include <climits>
using namespace aql_profile;
namespace {
struct DummyBuffer {
std::array<char, 4096> data;
};
TEST(CommandBufferMgrTest, BasicPrefixAndSize) {
DummyBuffer buf;
CommandBufferMgr mgr(buf.data.data(), sizeof(buf.data));
// Should not throw and should have a nonzero size
EXPECT_GT(mgr.GetSize(), 0u);
}
TEST(CommandBufferMgrTest, FinalizeThrowsOnSmallData) {
DummyBuffer buf;
CommandBufferMgr mgr(buf.data.data(), sizeof(buf.data));
mgr.SetPreSize(128);
// Finalize with data_size <= precmds_size should throw
EXPECT_THROW(mgr.Finalize(64), aql_profile_exc_msg);
}
} // namespace
@@ -0,0 +1,118 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <hsa/hsa.h>
#include "core/include/aql_profile_v2.h"
// Mocks and helpers
namespace {
struct MockMemory {
std::vector<uint8_t> data;
void* alloc(size_t size) {
data.resize(size);
return data.data();
}
void dealloc(void* /*ptr*/) {
// No-op for vector-backed memory
}
void copy(void* dst, const void* src, size_t size) {
memcpy(dst, src, size);
}
};
// Corrected mock_alloc matching aqlprofile_memory_alloc_callback_t
hsa_status_t mock_alloc(void** ptr, uint64_t size, aqlprofile_buffer_desc_flags_t /*flags*/, void* userdata) {
auto* mem = static_cast<MockMemory*>(userdata);
*ptr = mem->alloc(size);
return HSA_STATUS_SUCCESS;
}
// Corrected mock_dealloc matching aqlprofile_memory_dealloc_callback_t
void mock_dealloc(void* /*ptr*/, void* /*userdata*/) {
// No-op
}
// Corrected mock_memcpy matching aqlprofile_memory_copy_t
hsa_status_t mock_memcpy(void* dst, const void* src, size_t size, void* /*userdata*/) {
memcpy(dst, src, size);
return HSA_STATUS_SUCCESS;
}
} // namespace
// Test: CreatePacketsSuccess
TEST(CountersTest, CreatePacketsSuccess) {
MockMemory mem;
aqlprofile_pmc_profile_t profile = {};
// Fill profile with minimal valid data
profile.agent.handle = 0; // Use a valid agent handle in real test
profile.events = nullptr;
profile.event_count = 0;
aqlprofile_handle_t handle = {};
aqlprofile_pmc_aql_packets_t packets = {};
hsa_status_t status = aqlprofile_pmc_create_packets(
&handle, &packets, profile, mock_alloc, mock_dealloc, mock_memcpy, &mem);
// Accept HSA_STATUS_ERROR if agent/event is not valid in this test context
EXPECT_TRUE(status == HSA_STATUS_SUCCESS || status == HSA_STATUS_ERROR);
}
// Test: DeletePackets
TEST(CountersTest, DeletePackets) {
MockMemory mem;
aqlprofile_pmc_profile_t profile = {};
profile.agent.handle = 0;
profile.events = nullptr;
profile.event_count = 0;
aqlprofile_handle_t handle = {};
aqlprofile_pmc_aql_packets_t packets = {};
hsa_status_t status = aqlprofile_pmc_create_packets(
&handle, &packets, profile, mock_alloc, mock_dealloc, mock_memcpy, &mem);
// Only proceed if creation succeeded
if (status == HSA_STATUS_SUCCESS) {
// This should not crash or throw
aqlprofile_pmc_delete_packets(handle);
}
}
// Test: ValidateEvent
TEST(CountersTest, ValidateEvent) {
aqlprofile_agent_handle_t agent = {};
agent.handle = 0;
aqlprofile_pmc_event_t event = {};
event.block_name = HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM;
bool result = true;
hsa_status_t status = aqlprofile_validate_pmc_event(agent, &event, &result);
// In a mock environment, we can't guarantee validation, but we can check that it runs
EXPECT_TRUE(status == HSA_STATUS_SUCCESS || status == HSA_STATUS_ERROR);
}
@@ -0,0 +1,124 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "core/memorymanager.hpp"
#include "gtest/gtest.h"
#include <cstring>
// Dummy alloc/dealloc functions for testing
hsa_status_t dummy_alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t, void*) {
*ptr = malloc(size);
return *ptr ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
void dummy_dealloc(void* ptr, void*) {
free(ptr);
}
hsa_status_t dummy_alloc_fail(void**, size_t, aqlprofile_buffer_desc_flags_t, void*) {
return HSA_STATUS_ERROR;
}
hsa_status_t dummy_copy(void* dst, const void* src, size_t size, void*) {
memcpy(dst, src, size);
return HSA_STATUS_SUCCESS;
}
// Helper subclass to expose protected AllocMemory for testing
class TestCounterMemoryManager : public CounterMemoryManager {
public:
using CounterMemoryManager::CounterMemoryManager;
using CounterMemoryManager::AllocMemory;
};
TEST(CounterMemoryManagerTest, AllocMemory_Success) {
hsa_agent_t agent = {.handle = 1};
TestCounterMemoryManager mgr(agent, dummy_alloc, dummy_dealloc, nullptr);
aqlprofile_buffer_desc_flags_t flags{};
auto mem = mgr.AllocMemory(64, flags);
ASSERT_NE(mem.get(), nullptr);
}
TEST(CounterMemoryManagerTest, AllocMemory_FailureThrows) {
hsa_agent_t agent = {.handle = 1};
TestCounterMemoryManager mgr(agent, dummy_alloc_fail, dummy_dealloc, nullptr);
aqlprofile_buffer_desc_flags_t flags{};
EXPECT_THROW(mgr.AllocMemory(64, flags), hsa_status_t);
}
TEST(CounterMemoryManagerTest, CmdBufAndOutputBuf) {
hsa_agent_t agent = {.handle = 1};
CounterMemoryManager mgr(agent, dummy_alloc, dummy_dealloc, nullptr);
mgr.CreateCmdBuf(128);
ASSERT_NE(mgr.GetCmdBuf(), nullptr);
mgr.CreateOutputBuf(256);
ASSERT_NE(mgr.GetOutputBuf(), nullptr);
ASSERT_EQ(mgr.GetOutputBufSize(), 256u);
}
TEST(CounterMemoryManagerTest, RegisterAndGetManager) {
hsa_agent_t agent = {.handle = 1};
auto mgr = std::make_shared<CounterMemoryManager>(agent, dummy_alloc, dummy_dealloc, nullptr);
size_t handle = mgr->GetHandler();
CounterMemoryManager::RegisterManager(mgr);
auto found = CounterMemoryManager::GetManager(handle);
ASSERT_EQ(found.get(), mgr.get());
CounterMemoryManager::DeleteManager(handle);
ASSERT_EQ(CounterMemoryManager::GetManager(handle), nullptr);
}
TEST(CounterMemoryManagerTest, CopyEvents) {
hsa_agent_t agent = {.handle = 1};
CounterMemoryManager mgr(agent, dummy_alloc, dummy_dealloc, nullptr);
aqlprofile_pmc_event_t events[2]{};
events[0].event_id = 1;
events[0].flags.raw = 0;
events[1].event_id = 2;
events[1].flags.raw = 1;
mgr.CopyEvents(events, 2);
auto& ev = mgr.GetEvents();
ASSERT_GE(ev.size(), 2u);
}
TEST(TraceMemoryManagerTest, TraceControlBufAndATTParams) {
hsa_agent_t agent = {.handle = 1};
TraceMemoryManager mgr(agent, dummy_alloc, dummy_dealloc, dummy_copy, nullptr);
mgr.CreateTraceControlBuf(64);
auto buf = mgr.GetTraceControlBuf<uint8_t>();
ASSERT_NE(buf, nullptr);
hsa_ven_amd_aqlprofile_parameter_t params[2]{};
params[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET;
params[0].value = 42;
params[1].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION;
params[1].value = 0xF;
mgr.CopyATTParams(params, 2);
ASSERT_EQ(mgr.GetSimdMask(), 0xF);
const auto& att_params = mgr.GetATTParams();
ASSERT_EQ(att_params.size(), 2u);
}
TEST(CodeobjMemoryManagerTest, CmdBufferAlloc) {
hsa_agent_t agent = {.handle = 1};
CodeobjMemoryManager mgr(agent, dummy_alloc, dummy_dealloc, 128, nullptr);
ASSERT_NE(mgr.cmd_buffer.get(), nullptr);
}
@@ -0,0 +1,28 @@
include(GoogleTest)
find_package(GTest REQUIRED)
include_directories(${GTEST_INCLUDE_DIRS})
# Add a test for gfx9 command builder
add_executable(gfx9-command-builder-test)
SET(AQLPROFILE_COMMAND_BUILDER_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/cmd_builder_tests.cpp
)
target_sources(gfx9-command-builder-test PRIVATE ${AQLPROFILE_COMMAND_BUILDER_SOURCES})
target_include_directories(gfx9-command-builder-test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${LIB_DIR})
target_link_libraries(
gfx9-command-builder-test
PRIVATE
hsa-runtime64::hsa-runtime64
GTest::gtest
GTest::gtest_main
GTest::gmock
GTest::gmock_main)
gtest_add_tests(
TARGET gfx9-command-builder-test
SOURCES ${AQLPROFILE_COMMAND_BUILDER_SOURCES}
TEST_LIST gfx9-command-builder-test_TESTS
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set_tests_properties(
${gfx9-command-builder-test_TESTS} PROPERTIES TIMEOUT 45 LABELS "unittests" FAIL_REGULAR_EXPRESSION
"${AQLPROFILE_DEFAULT_FAIL_REGEX}")
@@ -0,0 +1,71 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <gtest/gtest.h>
#include "pm4/cmd_builder.h"
using pm4_builder::CmdBuffer;
struct DummyPacket {
uint32_t a;
uint32_t b;
};
TEST(CmdBufferTest, AppendSinglePacket) {
CmdBuffer buf;
DummyPacket pkt{0x12345678, 0x9abcdef0};
buf.Append(pkt);
// Should have 2 dwords
EXPECT_EQ(buf.DwSize(), 2u);
// Data should match what we appended
const uint32_t* data = static_cast<const uint32_t*>(buf.Data());
EXPECT_EQ(data[0], 0x12345678u);
EXPECT_EQ(data[1], 0x9abcdef0u);
}
TEST(CmdBufferTest, AppendMultiplePackets) {
CmdBuffer buf;
DummyPacket pkt1{1, 2};
DummyPacket pkt2{3, 4};
buf.Append(pkt1, pkt2);
EXPECT_EQ(buf.DwSize(), 4u);
const uint32_t* data = static_cast<const uint32_t*>(buf.Data());
EXPECT_EQ(data[0], 1u);
EXPECT_EQ(data[1], 2u);
EXPECT_EQ(data[2], 3u);
EXPECT_EQ(data[3], 4u);
}
TEST(CmdBufferTest, AppendRawData) {
CmdBuffer buf;
uint32_t raw[3] = {10, 20, 30};
buf.Append(raw, 3);
EXPECT_EQ(buf.DwSize(), 4u);
const uint32_t* data = static_cast<const uint32_t*>(buf.Data());
EXPECT_EQ(data[0], 10u);
EXPECT_EQ(data[1], 20u);
EXPECT_EQ(data[2], 30u);
}
+5
View File
@@ -82,3 +82,8 @@ install(FILES ${TEST_DIR}/run_install.sh RENAME run_tests.sh
#add_subdirectory(parser)
add_test(NAME legacy-tests COMMAND "${TEST_BINARY_DIR}/run.sh")
if(AQLPROFILE_BUILD_TESTS)
add_subdirectory(integration)
enable_testing()
endif()
@@ -0,0 +1,67 @@
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(Aqlprofile_v2_tests LANGUAGES C CXX HIP)
set(CMAKE_CXX_STANDARD 17)
find_library(
hsa-amd-aqlprofile64
REQUIRED
NAMES hsa-amd-aqlprofile64 hsa-amd-aqlprofile64
HINTS /opt/rocm/lib /opt/rocm
PATHS /opt/rocm/lib /opt/rocm)
find_package(
hsa-runtime64
REQUIRED
CONFIG
HINTS /opt/rocm/
PATHS /opt/rocm/)
get_property(
HSA_RUNTIME_INCLUDE_DIRECTORIES
TARGET hsa-runtime64::hsa-runtime64
PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
find_file(
HSA_H hsa.h
PATHS ${HSA_RUNTIME_INCLUDE_DIRECTORIES}
PATH_SUFFIXES hsa
NO_DEFAULT_PATH REQUIRED)
get_filename_component(HSA_RUNTIME_INC_PATH ${HSA_H} DIRECTORY)
include_directories(${HSA_RUNTIME_INC_PATH})
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O1" )
add_library(intercept SHARED)
target_include_directories(intercept PRIVATE ${HSA_RUNTIME_INC_PATH} ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/src/core/include/ /opt/rocm/include)
target_sources(intercept PRIVATE intercept.cpp)
target_link_libraries(intercept PRIVATE hsa-runtime64::hsa-runtime64 ${hsa-amd-aqlprofile64})
target_link_options(intercept PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined)
target_compile_definitions(intercept PUBLIC AMD_INTERNAL_BUILD)
add_executable(testv2)
target_sources(testv2 PRIVATE main.cpp workload.cpp counter.cpp agent.cpp)
target_include_directories(testv2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/src/core/include/ ${HSA_RUNTIME_INC_PATH} /opt/rocm/include)
target_link_libraries(testv2 PRIVATE hsa-runtime64::hsa-runtime64 ${hsa-amd-aqlprofile64})
target_compile_definitions(testv2 PUBLIC AMD_INTERNAL_BUILD)
# Add a PRELOAD environment with libintercept
set(ENV{LD_PRELOAD} "$ENV{LD_PRELOAD}:${CMAKE_CURRENT_BINARY_DIR}/libintercept.so")
add_test(NAME testv2 COMMAND testv2)
set_tests_properties(testv2 PROPERTIES ENVIRONMENT "${LD_PRELOAD}" TIMEOUT 45 LABELS "unittests" FAIL_REGULAR_EXPRESSION "${AQLPROFILE_DEFAULT_FAIL_REGEX}")
@@ -0,0 +1,235 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "agent.hpp"
#include <cstring>
#define CHECK_HSA(x) if ((x) != HSA_STATUS_SUCCESS) { std::cerr << __FILE__ << " error at " << __LINE__ << std::endl; exit(-1); }
std::vector<std::shared_ptr<AgentInfo>> AgentInfo::gpu_agents{};
hsa_agent_t AgentInfo::cpu_agent{0};
hsa_amd_memory_pool_t AgentInfo::cpu_pool;
hsa_amd_memory_pool_t AgentInfo::kernarg_pool;
void AgentInfo::add_event(aqlprofile_pmc_event_t block, const std::string& counter, int block_cnt, int event_id)
{
block.event_id = event_id;
std::vector<aqlprofile_pmc_event_t> cnt{};
for (int i=0; i<block_cnt; i++)
{
block.block_index = i;
cnt.push_back(block);
}
counters[counter] = std::move(cnt);
}
hsa_status_t AgentInfo::get_agent_handle_cb(hsa_agent_t agent, void* userdata)
{
hsa_device_type_t type;
CHECK_HSA(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type));
if (type == HSA_DEVICE_TYPE_CPU)
{
cpu_agent = agent;
return HSA_STATUS_SUCCESS;
}
std::shared_ptr<AgentInfo> info = std::make_shared<AgentInfo>();
info->hsa_agent = agent;
CHECK_HSA(hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, info->gfxip.data()));
CHECK_HSA(hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_XCC), &info->info.xcc_num));
CHECK_HSA(hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &info->info.se_num));
CHECK_HSA(hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &info->info.cu_num));
CHECK_HSA(hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &info->info.shader_arrays_per_se));
info->info.agent_gfxip = info->gfxip.data();
CHECK_HSA(aqlprofile_register_agent(&info->handle, &info->info));
aqlprofile_pmc_event_flags_t flags{.raw = 0};
aqlprofile_pmc_event_t grbm {.block_index = 0, .flags = flags, .block_name = HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM};
aqlprofile_pmc_event_t sq {.block_index = 0, .flags = flags, .block_name = HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ};
aqlprofile_pmc_event_t ta {.block_index = 0, .flags = flags, .block_name = HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA};
aqlprofile_pmc_event_t tcp {.block_index = 0, .flags = flags, .block_name = HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP};
aqlprofile_pmc_event_t tcc {.block_index = 0, .flags = flags, .block_name = HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC};
aqlprofile_pmc_event_t gl2c {.block_index = 0, .flags = flags, .block_name = HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C};
info->add_event(grbm, "GRBM_COUNT", 1, 0);
info->add_event(grbm, "GRBM_GUI_ACTIVE", 1, 2);
info->add_event(sq, "SQ_WAVES", 1, 4);
info->add_event(sq, "SQ_BUSY_CYCLES", 1, 3);
info->add_event(sq, "SQ_INSTS_VALU", 1, (info->gfxip.find("gfx1")==0) ? 62 : 26);
info->add_event(ta, "TA_BUSY", 16, (info->gfxip.find("gfx94") != 0 || info->gfxip.find("gfx95") != 0) ? 13 : 15);
if (info->gfxip.find("gfx1") == 0)
{
info->add_event(gl2c, "GL2C_REQ", 32, 3);
info->add_event(gl2c, "GL2C_READ", 32, 6);
info->add_event(gl2c, "GL2C_WRITE", 32, 7);
}
else if (info->gfxip.find("gfx95") == 0)
{
info->add_event(sq, "SQ_INSTS_VALU_FLOPS_FP16", 10, 81);
info->add_event(sq, "SQ_INSTS_VALU_FLOPS_FP32", 10, 82);
info->add_event(sq, "SQ_INSTS_VALU_FLOPS_FP64", 10, 83);
info->add_event(sq, "SQ_INSTS_VALU_FLOPS_FP16_TRANS", 10, 84);
info->add_event(sq, "SQ_INSTS_VALU_FLOPS_FP32_TRANS", 10, 85);
info->add_event(sq, "SQ_INSTS_VALU_FLOPS_FP64_TRANS", 10, 86);
info->add_event(tcp, "TCP_READ", 10, 28);
info->add_event(tcp, "TCP_WRITE", 10, 30);
info->add_event(tcp, "TCP_CACHE_ACCESS", 10, 58);
info->add_event(tcp, "TCP_CACHE_MISS_TG0", 10, 59);
info->add_event(tcp, "TCP_CACHE_MISS_TG1", 10, 60);
info->add_event(tcp, "TCP_CACHE_MISS_TG2", 10, 61);
info->add_event(tcp, "TCP_CACHE_MISS_TG3", 10, 62);
info->add_event(tcp, "TCP_CACHE_MISS", 10, 63);
info->add_event(tcc, "TCC_EA0_RDREQ", 16, 42);
info->add_event(tcc, "TCC_EA0_RDREQ_DRAM", 16, 108);
info->add_event(tcc, "TCC_EA0_WRREQ_DRAM", 16, 109);
info->add_event(tcc, "TCC_EA0_WRREQ_WRITE_DRAM", 16, 110);
info->add_event(tcc, "TCC_EA0_WRREQ_ATOMIC_DRAM", 16, 111);
info->add_event(tcc, "TCC_EA0_RDREQ_DRAM_32B", 16, 112);
info->add_event(tcc, "TCC_EA0_RDREQ_GMI_32B", 16, 113);
info->add_event(tcc, "TCC_EA0_RDREQ_IO_32B", 16, 114);
info->add_event(tcc, "TCC_EA0_WRREQ_WRITE_DRAM_32B", 16, 115);
info->add_event(tcc, "TCC_EA0_WRREQ_ATOMIC_DRAM_32B", 16, 116);
info->add_event(tcc, "TCC_EA0_WRREQ_WRITE_GMI_32B", 16, 117);
info->add_event(tcc, "TCC_EA0_WRREQ_ATOMIC_GMI_32B", 16, 118);
info->add_event(tcc, "TCC_EA0_WRREQ_WRITE_IO_32B", 16, 119);
info->add_event(tcc, "TCC_EA0_WRREQ_ATOMIC_IO_32B", 16, 119);
}
else if (info->gfxip.find("gfx94") == 0)
{
info->add_event(tcc, "TCC_REQ", 16, 3);
info->add_event(tcc, "TCC_ATOMIC", 16, 14);
info->add_event(tcc, "TCC_EA0_ATOMIC", 16, 36);
info->add_event(tcc, "TCC_EA0_WRREQ_CREDIT_STALL", 16, 30);
info->add_event(tcc, "TCC_EA0_WRREQ_IO_CREDIT_STALL", 16, 31);
info->add_event(tcc, "TCC_EA0_WRREQ_GMI_CREDIT_STALL", 16, 32);
info->add_event(tcc, "TCC_EA0_WRREQ_DRAM_CREDIT_STALL", 16, 33);
info->add_event(tcc, "TCC_EA0_RDREQ", 16, 38);
info->add_event(tcc, "TCC_EA0_RDREQ_IO_CREDIT_STALL", 16, 41);
info->add_event(tcc, "TCC_EA0_RDREQ_GMI_CREDIT_STALL", 16, 42);
info->add_event(tcc, "TCC_EA0_RDREQ_DRAM_CREDIT_STALL", 16, 43);
info->add_event(tcp, "TCP_READ", 10, 28);
info->add_event(tcp, "TCP_WRITE", 10, 30);
info->add_event(tcp, "TCP_CACHE_ACCESS", 10, 60);
info->add_event(tcp, "TCP_CACHE_MISS_TG0", 10, 61);
info->add_event(tcp, "TCP_CACHE_MISS_TG1", 10, 62);
info->add_event(tcp, "TCP_CACHE_MISS_TG2", 10, 63);
info->add_event(tcp, "TCP_CACHE_MISS_TG3", 10, 64);
}
else if (info->gfxip.find("gfx90a") == 0)
{
info->add_event(tcp, "TCP_READ", 16, 30);
info->add_event(tcp, "TCP_WRITE", 16, 32);
}
else if (info->gfxip.find("gfx900") == 0)
{
info->add_event(tcp, "TCP_READ", 16, 30);
info->add_event(tcp, "TCP_WRITE", 16, 32);
}
else
{
assert(false);
}
gpu_agents.push_back(info);
return HSA_STATUS_SUCCESS;
}
hsa_status_t
FindGlobalPool(hsa_amd_memory_pool_t pool, void* data)
{
hsa_amd_segment_t segment;
CHECK_HSA(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment));
if(HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS;
uint32_t flag;
CHECK_HSA(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag));
uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
if (karg_st)
AgentInfo::kernarg_pool = pool;
else
AgentInfo::cpu_pool = pool;
return HSA_STATUS_SUCCESS;
}
void AgentInfo::iterate_agents()
{
CHECK_HSA(hsa_iterate_agents(get_agent_handle_cb, nullptr));
CHECK_HSA(hsa_amd_agent_iterate_memory_pools(cpu_agent, FindGlobalPool, nullptr));
}
bool
Queue::Submit(hsa_ext_amd_aql_pm4_packet_t* packet)
{
const uint64_t write_idx = hsa_queue_add_write_index_relaxed(queue, 1);
size_t index = (write_idx % queue->size) * sizeof(hsa_ext_amd_aql_pm4_packet_t);
auto* queue_slot = reinterpret_cast<uint32_t*>(size_t(queue->base_address) + index); // NOLINT
const auto* slot_data = reinterpret_cast<const uint32_t*>(packet);
std::memcpy(&queue_slot[1], &slot_data[1], sizeof(hsa_ext_amd_aql_pm4_packet_t) - sizeof(uint32_t));
auto* header = reinterpret_cast<std::atomic<uint32_t>*>(queue_slot);
header->store(slot_data[0], std::memory_order_release);
hsa_signal_store_screlease(queue->doorbell_signal, write_idx);
int loops = 0;
while(hsa_queue_load_read_index_relaxed(queue) <= write_idx)
{
loops++;
usleep(1);
if(loops > 10000)
{
std::cerr << "Codeobj packet submission failed!" << std::endl;
return false;
}
}
return true;
}
Queue::Queue(std::shared_ptr<AgentInfo>& _agent): agent(_agent)
{
CHECK_HSA(hsa_queue_create(agent->hsa_agent, 64, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &this->queue));
}
void Queue::flush()
{
return;
hsa_barrier_and_packet_t barrier{};
barrier.header = HSA_PACKET_TYPE_BARRIER_OR | (1<<HSA_PACKET_HEADER_BARRIER);
barrier.header |= HSA_FENCE_SCOPE_SYSTEM<<HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;
barrier.header |= HSA_FENCE_SCOPE_SYSTEM<<HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
Submit((hsa_ext_amd_aql_pm4_packet_t*)&barrier);
}
@@ -0,0 +1,88 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <vector>
#include <map>
#include <atomic>
#include <future>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_aqlprofile.h>
//#include "/opt/rocm/include/aqlprofile-sdk/aql_profile_v2.h"
#include "aql_profile_v2.h"
#define CHECK_HSA(x) if ((x) != HSA_STATUS_SUCCESS) { std::cerr << __FILE__ << " error at " << __LINE__ << std::endl; exit(-1); }
inline bool operator==(const aqlprofile_pmc_event_t& a, const aqlprofile_pmc_event_t& b) {
return a.event_id == b.event_id && a.block_name == b.block_name;
}
inline bool operator<(const aqlprofile_pmc_event_t& a, const aqlprofile_pmc_event_t& b) {
return (a.block_name == b.block_name) ? (a.event_id < b.event_id) : (a.block_name < b.block_name);
}
struct AgentInfo
{
AgentInfo() { gfxip.resize(64); }
std::string gfxip;
hsa_agent_t hsa_agent;
aqlprofile_agent_info_t info;
aqlprofile_agent_handle_t handle;
std::map<std::string, std::vector<aqlprofile_pmc_event_t>> counters;
static void iterate_agents();
static std::vector<std::shared_ptr<AgentInfo>> gpu_agents;
static hsa_agent_t cpu_agent;
static hsa_amd_memory_pool_t cpu_pool;
static hsa_amd_memory_pool_t kernarg_pool;
private:
void add_event(aqlprofile_pmc_event_t block, const std::string& counter, int block_cnt, int event_id);
static hsa_status_t get_agent_handle_cb(hsa_agent_t agent, void* userdata);
};
class Queue
{
public:
Queue(std::shared_ptr<AgentInfo>& _agent);
~Queue() { if (queue) hsa_queue_destroy(queue); }
bool Submit(hsa_ext_amd_aql_pm4_packet_t* packet);
void flush();
hsa_queue_t* queue = nullptr;
std::shared_ptr<AgentInfo> agent;
};
@@ -0,0 +1,127 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <assert.h>
#include "counter.hpp"
#include <cstring>
#define CHECK_HSA(x) if ((x) != HSA_STATUS_SUCCESS) { std::cerr << __FILE__ << " error at " << __LINE__ << std::endl; exit(-1); }
hsa_status_t data_callback(
aqlprofile_pmc_event_t event,
uint64_t counter_id,
uint64_t counter_value,
void* userdata
) {
auto* packet = static_cast<AQLPacket*>(userdata);
try {
packet->results.at(event) += counter_value;
} catch(...) { abort(); }
return HSA_STATUS_SUCCESS;
}
hsa_status_t
AQLPacket::Alloc(void** ptr, size_t size, desc_t flags, void* data)
{
auto* packet = reinterpret_cast<AQLPacket*>(data);
assert(packet && "Invalid aql packet");
if (flags.memory_hint != AQLPROFILE_MEMORY_HINT_DEVICE_UNCACHED)
{
CHECK_HSA(hsa_amd_memory_pool_allocate(AgentInfo::cpu_pool, size,
HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, ptr));
CHECK_HSA(hsa_amd_memory_fill(*ptr, 0u, size / sizeof(uint32_t)));
return hsa_amd_agents_allow_access(1, &packet->hsa_agent, nullptr, *ptr);
}
else
{
CHECK_HSA(hsa_amd_memory_pool_allocate(AgentInfo::kernarg_pool, size,
HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, ptr));
CHECK_HSA(hsa_amd_memory_fill(*ptr, 0u, size / sizeof(uint32_t)));
return hsa_amd_agents_allow_access(1, &packet->hsa_agent, nullptr, *ptr);
}
}
void AQLPacket::Free(void* ptr, void* data)
{
if(ptr == nullptr) return;
hsa_amd_memory_pool_free(ptr);
}
hsa_status_t AQLPacket::Copy(void* dst, const void* src, size_t size, void* data)
{
if(size == 0) return HSA_STATUS_SUCCESS;
return hsa_memory_copy(dst, src, size);
}
AQLPacket::AQLPacket(
AgentInfo& agent,
const std::vector<std::string>& _counters
): hsa_agent(agent.hsa_agent) {
constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet =
{
.header = 0,
.pm4_command = {0},
.completion_signal = {.handle = 0}
};
packets.start_packet = null_amd_aql_pm4_packet;
packets.stop_packet = null_amd_aql_pm4_packet;
packets.read_packet = null_amd_aql_pm4_packet;
aqlprofile_pmc_profile_t profile{};
std::vector<aqlprofile_pmc_event_t> events;
for (auto& counter : _counters)
{
auto& event = agent.counters.at(counter).at(0);
results[event] = 0;
prev_results[event] = 0;
counter_names[event] = counter;
for (auto& ev : agent.counters.at(counter))
events.push_back(ev);
}
profile.agent = agent.handle;
profile.events = events.data();
profile.event_count = static_cast<uint32_t>(events.size());
CHECK_HSA(aqlprofile_pmc_create_packets(&this->handle,
&this->packets,
profile,
&AQLPacket::Alloc,
&AQLPacket::Free,
&AQLPacket::Copy,
this));
packets.start_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
packets.stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
packets.read_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
}
void AQLPacket::iterate()
{
for (auto& [key, value] : results)
{
prev_results[key] = value;
results[key] = 0;
}
CHECK_HSA(aqlprofile_pmc_iterate_data(this->handle, data_callback, this));
}
@@ -0,0 +1,72 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <assert.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_aqlprofile.h>
#include "aql_profile_v2.h"
#include <stdlib.h>
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <vector>
#include <map>
#include "agent.hpp"
class AQLPacket
{
using desc_t = aqlprofile_buffer_desc_flags_t;
public:
AQLPacket(AgentInfo& _agent, const std::vector<std::string>& counters);
~AQLPacket() { aqlprofile_pmc_delete_packets(this->handle); };
void iterate();
static hsa_status_t Alloc(void** ptr, size_t size, desc_t flags, void* data);
static void Free(void* ptr, void* data);
static hsa_status_t Copy(void* dst, const void* src, size_t size, void* data);
std::map<std::string, int64_t> get()
{
std::map<std::string, int64_t> ret;
for (auto& [event, counter] : counter_names)
ret.emplace(counter, results.at(event));
return ret;
}
std::map<aqlprofile_pmc_event_t, std::string> counter_names;
std::map<aqlprofile_pmc_event_t, int64_t> results;
std::map<aqlprofile_pmc_event_t, int64_t> prev_results;
aqlprofile_handle_t handle{0};
hsa_agent_t hsa_agent;
aqlprofile_pmc_aql_packets_t packets;
bool delta = false;
};
@@ -0,0 +1,4 @@
{
global: HSA_AMD_TOOL_PRIORITY; OnLoad;
local: *;
};
@@ -0,0 +1,168 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <assert.h>
#include <stdio.h>
#include <algorithm>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>
#include <vector>
#include <atomic>
#include <cstring>
#include <hsa/hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_api_trace.h>
#include <hsa_ven_amd_aqlprofile.h>
#define CHECK_HSA(x) { auto _status = (x); if (_status != HSA_STATUS_SUCCESS) { std::cerr << __FILE__ << ':' << __LINE__ << std::endl; abort(); } }
extern "C" const uint32_t HSA_AMD_TOOL_PRIORITY = 25;
decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn = nullptr;
decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn = nullptr;
decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn = nullptr;
decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn = nullptr;
decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn = nullptr;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn = nullptr;
decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn = nullptr;
decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn = nullptr;
decltype(hsa_agent_get_info)* hsa_agent_get_info_fn = nullptr;
decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn = nullptr;
decltype(hsa_queue_create)* hsa_queue_create_fn = nullptr;
hsa_amd_memory_pool_t cpu_pool;
hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data)
{
hsa_amd_segment_t segment;
CHECK_HSA(hsa_amd_memory_pool_get_info_fn(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment));
if(HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS;
uint32_t flag;
CHECK_HSA(hsa_amd_memory_pool_get_info_fn(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag));
uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
if (karg_st == 0) cpu_pool = pool;
return HSA_STATUS_SUCCESS;
}
hsa_status_t iterate_agent_cb(hsa_agent_t agent, void* userdata)
{
hsa_device_type_t type;
CHECK_HSA(hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_DEVICE, &type));
if (type != HSA_DEVICE_TYPE_CPU) return HSA_STATUS_SUCCESS;
CHECK_HSA(hsa_amd_agent_iterate_memory_pools_fn(agent, FindGlobalPool, nullptr));
return HSA_STATUS_SUCCESS;
}
bool queue_submit(hsa_queue_t* queue, hsa_ext_amd_aql_pm4_packet_t* packet)
{
const uint64_t write_idx = hsa_queue_add_write_index_relaxed_fn(queue, 1);
size_t index = (write_idx % queue->size) * sizeof(hsa_ext_amd_aql_pm4_packet_t);
auto* queue_slot = reinterpret_cast<uint32_t*>(size_t(queue->base_address) + index); // NOLINT
const auto* slot_data = reinterpret_cast<const uint32_t*>(packet);
std::memcpy(&queue_slot[1], &slot_data[1], sizeof(hsa_ext_amd_aql_pm4_packet_t) - sizeof(uint32_t));
auto* header = reinterpret_cast<std::atomic<uint32_t>*>(queue_slot);
header->store(slot_data[0], std::memory_order_release);
hsa_signal_store_screlease_fn(queue->doorbell_signal, write_idx);
int loops = 0;
while(hsa_queue_load_read_index_relaxed_fn(queue) <= write_idx)
{
loops++;
usleep(1);
if(loops > 10000)
{
std::cerr << "Packet submission failed!" << std::endl;
return false;
}
}
return true;
}
void set_profiler_active_on_queue(
hsa_agent_t hsa_agent,
hsa_queue_t* queue
) {
hsa_ext_amd_aql_pm4_packet_t packet{};
hsa_ven_amd_aqlprofile_profile_t profile{};
profile.agent = hsa_agent;
// Query for cmd buffer size
CHECK_HSA(hsa_ven_amd_aqlprofile_get_info(&profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, nullptr));
// Allocate cmd buffer
const size_t mask = 0x1000 - 1;
auto size = (profile.command_buffer.size + mask) & ~mask;
CHECK_HSA(hsa_amd_memory_pool_allocate_fn(cpu_pool, size,
HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, &profile.command_buffer.ptr));
CHECK_HSA(hsa_amd_agents_allow_access_fn(1, &hsa_agent, nullptr, profile.command_buffer.ptr));
CHECK_HSA(hsa_ven_amd_aqlprofile_get_info(&profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, &packet));
queue_submit(queue, &packet);
hsa_amd_memory_pool_free_fn(profile.command_buffer.ptr);
}
hsa_status_t QueueCreateInterceptor(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue)
{
CHECK_HSA(hsa_queue_create_fn(agent, size, type, callback, data, private_segment_size, group_segment_size, queue));
//CHECK_HSA(hsa_amd_profiling_set_profiler_enabled_fn(*queue, true));
set_profiler_active_on_queue(agent, *queue);
return HSA_STATUS_SUCCESS;
}
extern "C" __attribute__((visibility("default"))) bool
OnLoad(HsaApiTable* table, uint64_t, uint64_t, const char* const*)
{
hsa_queue_create_fn = table->core_->hsa_queue_create_fn;
// Install the Queue intercept
table->core_->hsa_queue_create_fn = QueueCreateInterceptor;
hsa_amd_profiling_set_profiler_enabled_fn = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
hsa_amd_memory_pool_allocate_fn = table->amd_ext_->hsa_amd_memory_pool_allocate_fn;
hsa_amd_agents_allow_access_fn = table->amd_ext_->hsa_amd_agents_allow_access_fn;
hsa_amd_memory_pool_free_fn = table->amd_ext_->hsa_amd_memory_pool_free_fn;
hsa_signal_store_screlease_fn = table->core_->hsa_signal_store_screlease_fn;
hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_queue_add_write_index_relaxed_fn = table->core_->hsa_queue_add_write_index_relaxed_fn;
hsa_amd_memory_pool_get_info_fn = table->amd_ext_->hsa_amd_memory_pool_get_info_fn;
hsa_amd_agent_iterate_memory_pools_fn = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
hsa_agent_get_info_fn = table->core_->hsa_agent_get_info_fn;
CHECK_HSA(table->core_->hsa_iterate_agents_fn(iterate_agent_cb, nullptr));
return true;
}
@@ -0,0 +1,419 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <vector>
#include <map>
#include <atomic>
#include <future>
#include "counter.hpp"
#include "workload.hpp"
#include "hip/hip_runtime.h"
#define DATA_SIZE (64*4)
#define HIP_API_CALL(CALL) do { if ((CALL) != hipSuccess) abort(); } while(0)
//#define ATTEMPT_GMI
#define DOT2_ARCH
class hipMemory
{
public:
hipMemory(size_t size)
{
HIP_API_CALL(hipMalloc(&ptr, size * sizeof(float)));
HIP_API_CALL(hipMemset(ptr, 0, size * sizeof(float)));
}
~hipMemory()
{
if(ptr) HIP_API_CALL(hipFree(ptr));
}
hipMemory(hipMemory& other) = delete;
hipMemory& operator=(hipMemory& other) = delete;
float* ptr = nullptr;
};
class Stream
{
public:
Stream() { HIP_API_CALL(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); }
~Stream() { HIP_API_CALL(hipStreamDestroy(stream)); }
Stream(Stream& other) = delete;
Stream& operator=(Stream& other) = delete;
void synchronize() { HIP_API_CALL(hipStreamSynchronize(stream)); }
hipStream_t stream;
};
class HIPWorkload : public IWorkload
{
public:
HIPWorkload(AgentInfo& agent, const std::vector<std::string>& counters)
{
col = std::make_unique<Collection>(agent, counters);
}
virtual ~HIPWorkload() {};
virtual std::string_view name() = 0;
std::map<std::string, int64_t> collect(Queue& queue)
{
assert(col);
return col->iterate(queue, *this);
}
void printcounters(Queue& queue)
{
std::cout << "Name: " << name() << std::endl;
for (auto& [name, v] : collect(queue)) std::cout << " - " << name << ": " << v << std::endl;
}
std::unique_ptr<Collection> col{nullptr};
hipMemory src{DATA_SIZE};
hipMemory dst{DATA_SIZE};
Stream stream{};
};
__global__ void copy_kernel(float* a, const float* b)
{
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx < DATA_SIZE)
a[idx] = b[idx];
}
__global__ void atomic_kernel(float* a, const float* b)
{
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx < DATA_SIZE)
atomicAdd(a+threadIdx.x, b[idx]);
}
__global__ void iops_kernel_trans()
{
// 3 F16 Trans OPS
asm volatile("v_cos_f16 v0, v0; v_cos_f16 v1, v1; v_cos_f16 v2, v2;");
// 2 F32 Trans OPS
asm volatile("v_cos_f32 v3, v3; v_cos_f32 v4, v4");
}
__global__ void iops_kernel1()
{
asm volatile("v_add_f16 v2, v1, v0"); // 1 F16 OPS
asm volatile("v_fma_f32 v3, v1, v2, v3"); // 2 F32 OPs
asm volatile("v_add_f64 v[0:1], v[2:3], v[4:5]"); // 1 F64 OP
asm volatile("v_fma_f64 v[0:1], v[2:3], v[4:5], v[6:7]"); // 2 F64 OP
asm volatile("v_fma_f64 v[0:1], v[2:3], v[4:5], v[6:7]"); // 2 F64 OP
}
__global__ void iops_kernel2()
{
#if defined(__gfx940__) || defined(__gfx90a__) || defined(__gfx1030__)
// Supported architectures
asm volatile("v_dot2_f32_f16 v0, v1, v2, v3");
#else
// Fallback or skip
asm volatile("v_add_f32 v4, v5, v6"); // 1 F32 OP
asm volatile("v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]"); // 2 F64 OPs
#endif
}
class CopyWorkload : public HIPWorkload
{
public:
CopyWorkload(AgentInfo& agent, const std::vector<std::string>& counters): HIPWorkload(agent, counters) {}
virtual void run() override
{
copy_kernel<<<DATA_SIZE/64,64,0,stream.stream>>>(dst.ptr, src.ptr);
stream.synchronize();
}
virtual std::string_view name() override { return "CopyWorkload"; };
};
class AtomicWorkload : public HIPWorkload
{
public:
AtomicWorkload(AgentInfo& agent, const std::vector<std::string>& counters): HIPWorkload(agent, counters) {}
virtual void run() override
{
atomic_kernel<<<DATA_SIZE/64,64,0,stream.stream>>>(dst.ptr, src.ptr);
stream.synchronize();
}
virtual std::string_view name() override { return "AtomicWorkload"; };
};
class IOPSWorkload1 : public HIPWorkload
{
public:
IOPSWorkload1(AgentInfo& agent, const std::vector<std::string>& counters): HIPWorkload(agent, counters) {}
virtual void run() override
{
iops_kernel1<<<DATA_SIZE/64,64,0,stream.stream>>>();
stream.synchronize();
}
virtual std::string_view name() override { return "IOPSWorkload1"; };
};
class IOPSWorkload2 : public HIPWorkload
{
public:
IOPSWorkload2(AgentInfo& agent, const std::vector<std::string>& counters): HIPWorkload(agent, counters) {}
virtual void run() override
{
iops_kernel2<<<DATA_SIZE/64,64,0,stream.stream>>>();
stream.synchronize();
}
virtual std::string_view name() override { return "IOPSWorkload2"; };
};
class IOPSWorkload3 : public HIPWorkload
{
public:
IOPSWorkload3(AgentInfo& agent, const std::vector<std::string>& counters): HIPWorkload(agent, counters) {}
virtual void run() override
{
iops_kernel_trans<<<DATA_SIZE/64,64,0,stream.stream>>>();
stream.synchronize();
}
virtual std::string_view name() override { return "Trans IOPSWorkload"; };
};
class GMIWorkload : public HIPWorkload
{
public:
GMIWorkload(AgentInfo& agent, const std::vector<std::string>& counters): HIPWorkload(agent, counters) {}
virtual void run() override
{
auto policies = std::vector<unsigned>{hipHostMallocDefault, hipHostMallocCoherent, hipHostMallocNonCoherent};
for (auto& policy : policies)
{
float* srchost;
float* dsthost;
HIP_API_CALL(hipHostMalloc(&srchost, DATA_SIZE * sizeof(float), policy));
HIP_API_CALL(hipHostMalloc(&dsthost, DATA_SIZE * sizeof(float), policy));
for (size_t i=0; i<DATA_SIZE; i++)
srchost[i] = float(i);
copy_kernel<<<DATA_SIZE/64,64,0,stream.stream>>>(dsthost, srchost);
stream.synchronize();
atomic_kernel<<<DATA_SIZE/64,64,0,stream.stream>>>(srchost, dsthost);
stream.synchronize();
copy_kernel<<<DATA_SIZE/64,64,0,stream.stream>>>(dst.ptr, src.ptr);
stream.synchronize();
HIP_API_CALL(hipHostFree(srchost));
HIP_API_CALL(hipHostFree(dsthost));
}
}
virtual std::string_view name() override { return "GMIWorkload"; };
};
auto tcp1_counters(std::string_view gfxip)
{
std::vector<std::string> counters = {"GRBM_COUNT", "SQ_WAVES", "SQ_INSTS_VALU"};
if (gfxip.find("gfx95") == 0)
{
counters.push_back("TCP_CACHE_ACCESS");
counters.push_back("TCP_CACHE_MISS");
counters.push_back("TCP_READ");
counters.push_back("TCP_WRITE");
counters.push_back("TCC_EA0_WRREQ_DRAM");
counters.push_back("TCC_EA0_WRREQ_WRITE_DRAM");
counters.push_back("TCC_EA0_WRREQ_WRITE_DRAM_32B");
counters.push_back("TCC_EA0_WRREQ_ATOMIC_DRAM");
}
else if (gfxip.find("gfx94") == 0)
{
counters.push_back("TCP_READ");
counters.push_back("TCP_WRITE");
counters.push_back("TCC_REQ");
counters.push_back("TCC_EA0_RDREQ");
counters.push_back("TCC_ATOMIC");
counters.push_back("TCC_EA0_ATOMIC");
}
return counters;
}
auto tcp2_counters(std::string_view gfxip)
{
std::vector<std::string> counters = {"GRBM_COUNT", "SQ_WAVES", "SQ_INSTS_VALU"};
if (gfxip.find("gfx95") == 0)
{
counters.push_back("TCP_CACHE_MISS_TG0");
counters.push_back("TCP_CACHE_MISS_TG1");
counters.push_back("TCP_CACHE_MISS_TG2");
counters.push_back("TCP_CACHE_MISS_TG3");
}
else if (gfxip.find("gfx94") == 0)
{
counters.push_back("TCP_READ");
counters.push_back("TCP_WRITE");
counters.push_back("TCC_REQ");
counters.push_back("TCC_EA0_RDREQ");
counters.push_back("TCC_ATOMIC");
counters.push_back("TCC_EA0_ATOMIC");
}
return counters;
}
auto atomic_counters(std::string_view gfxip)
{
std::vector<std::string> counters = {"GRBM_COUNT", "SQ_WAVES", "SQ_INSTS_VALU"};
if (gfxip.find("gfx95") == 0)
{
counters.push_back("TCC_EA0_WRREQ_ATOMIC_DRAM");
counters.push_back("TCC_EA0_WRREQ_ATOMIC_DRAM_32B");
counters.push_back("TCC_EA0_WRREQ_DRAM");
counters.push_back("TCC_EA0_WRREQ_WRITE_DRAM");
}
else if (gfxip.find("gfx94") == 0)
{
counters.push_back("TCP_READ");
counters.push_back("TCP_WRITE");
counters.push_back("TCC_REQ");
counters.push_back("TCC_EA0_RDREQ");
counters.push_back("TCC_ATOMIC");
counters.push_back("TCC_EA0_ATOMIC");
}
return counters;
}
auto iops_counters(std::string_view gfxip)
{
std::vector<std::string> counters = {"GRBM_COUNT", "SQ_WAVES", "SQ_INSTS_VALU"};
if (gfxip.find("gfx95") == 0)
{
counters.push_back("SQ_INSTS_VALU_FLOPS_FP16");
counters.push_back("SQ_INSTS_VALU_FLOPS_FP32");
counters.push_back("SQ_INSTS_VALU_FLOPS_FP64");
counters.push_back("SQ_INSTS_VALU_FLOPS_FP16_TRANS");
counters.push_back("SQ_INSTS_VALU_FLOPS_FP32_TRANS");
counters.push_back("SQ_INSTS_VALU_FLOPS_FP64_TRANS");
}
return counters;
}
auto gmi_counters(std::string_view gfxip)
{
std::vector<std::string> counters = {"GRBM_COUNT", "SQ_WAVES", "SQ_INSTS_VALU"};
if (gfxip.find("gfx95") == 0)
{
counters.push_back("TCC_EA0_RDREQ");
counters.push_back("TCC_EA0_RDREQ_GMI_32B");
counters.push_back("TCC_EA0_WRREQ_GMI_32B");
counters.push_back("TCC_EA0_ATOMIC_GMI_32B");
}
else if (gfxip.find("gfx94") == 0)
{
counters.push_back("TCC_EA0_WRREQ_CREDIT_STALL");
counters.push_back("TCC_EA0_WRREQ_IO_CREDIT_STALL");
counters.push_back("TCC_EA0_WRREQ_GMI_CREDIT_STALL");
counters.push_back("TCC_EA0_WRREQ_DRAM_CREDIT_STALL");
}
return counters;
}
auto io_counters(std::string_view gfxip)
{
std::vector<std::string> counters = {"GRBM_COUNT", "SQ_WAVES", "SQ_INSTS_VALU"};
if (gfxip.find("gfx95") == 0)
{
counters.push_back("TCC_EA0_RDREQ");
counters.push_back("TCC_EA0_RDREQ_IO_32B");
counters.push_back("TCC_EA0_WRREQ_IO_32B");
counters.push_back("TCC_EA0_ATOMIC_IO_32B");
}
else if (gfxip.find("gfx94") == 0)
{
counters.push_back("TCC_EA0_RDREQ");
counters.push_back("TCC_EA0_RDREQ_IO_CREDIT_STALL");
counters.push_back("TCC_EA0_RDREQ_GMI_CREDIT_STALL");
counters.push_back("TCC_EA0_RDREQ_DRAM_CREDIT_STALL");
}
return counters;
}
void printcounters(const std::map<std::string, int64_t>& map)
{
for (auto& [name, v] : map) std::cout << " - " << name << ": " << v << std::endl;
}
int main()
{
CHECK_HSA(hsa_init());
AgentInfo::iterate_agents();
auto agent = AgentInfo::gpu_agents.at(0);
{
Queue queue(agent);
CopyWorkload tcp1(*agent, tcp1_counters(agent->gfxip));
CopyWorkload tcp2(*agent, tcp2_counters(agent->gfxip));
AtomicWorkload atomic(*agent, atomic_counters(agent->gfxip));
IOPSWorkload1 iops1(*agent, iops_counters(agent->gfxip));
IOPSWorkload2 iops2(*agent, iops_counters(agent->gfxip));
IOPSWorkload3 iops3(*agent, iops_counters(agent->gfxip));
// warmup
tcp1.run();
tcp2.run();
atomic.run();
// Test
tcp1.printcounters(queue);
tcp2.printcounters(queue);
atomic.printcounters(queue);
iops1.printcounters(queue);
iops2.printcounters(queue);
iops3.printcounters(queue);
#ifdef ATTEMPT_GMI
GMIWorkload(*agent, gmi_counters(agent->gfxip)).printcounters(queue);
GMIWorkload(*agent, io_counters(agent->gfxip)).printcounters(queue);
#endif
}
CHECK_HSA(hsa_shut_down());
return 0;
}
@@ -0,0 +1,68 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include "counter.hpp"
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <vector>
#include <map>
#include <atomic>
#include <future>
#include "workload.hpp"
#include "counter.hpp"
Collection::Collection(AgentInfo& agent, const std::vector<std::string>& counters)
: packet(std::make_unique<AQLPacket>(agent, counters)) {}
Collection::~Collection() {}
std::map<std::string, int64_t> Collection::iterate(Queue& queue, IWorkload& load)
{
start(queue);
load.run();
stop(queue);
return packet->get();
}
void Collection::start(Queue& queue)
{
assert(packet);
queue.flush();
queue.Submit(&packet->packets.start_packet);
}
void Collection::stop(Queue& queue)
{
assert(packet);
queue.flush();
queue.Submit(&packet->packets.read_packet);
queue.Submit(&packet->packets.stop_packet);
packet->iterate();
}
@@ -0,0 +1,61 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include "counter.hpp"
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <vector>
#include <map>
#include <atomic>
#include <future>
#include "agent.hpp"
class IWorkload
{
public:
IWorkload() {}
virtual ~IWorkload() {};
virtual void run() = 0;
};
class Collection
{
public:
Collection(AgentInfo& agent, const std::vector<std::string>& counters);
virtual ~Collection();
std::map<std::string, int64_t> iterate(Queue& queue, IWorkload& load);
private:
void start(Queue& queue);
void stop(Queue& queue);
std::unique_ptr<AQLPacket> packet{nullptr};
};