4117f4de75
* Adding tests for code coverage
* integration tests
* Fixing gtest dependencies
* Removing gmock linkage
* build fixes
* Fixing Analyze job
* CMake updates
* Fixing PSDB
* Addressing feedback
* Adding licenses
* Addressing feedback
* updating dashboard
* more API tests
* Addressing feedback
* correcting Macro
* preloading libintercept
[ROCm/aqlprofile commit: eca6739e13]
169 строки
7.2 KiB
C++
169 строки
7.2 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <algorithm>
|
|
#include <stdlib.h>
|
|
#include <iostream>
|
|
#include <unistd.h>
|
|
#include <vector>
|
|
#include <atomic>
|
|
#include <cstring>
|
|
#include <hsa/hsa.h>
|
|
#include <hsa_ext_amd.h>
|
|
#include <hsa_api_trace.h>
|
|
#include <hsa_ven_amd_aqlprofile.h>
|
|
|
|
// Evaluates the HSA call `x` exactly once. If the resulting status is not
// HSA_STATUS_SUCCESS, prints "<file>:<line>" to stderr and aborts the process.
//
// The body is wrapped in do { ... } while (0) so that `CHECK_HSA(call);` is a
// single statement and remains valid inside an unbraced if/else. The temporary
// uses a macro-specific name to reduce the chance of shadowing an identifier
// that appears inside `x`.
#define CHECK_HSA(x)                                              \
    do {                                                          \
        const auto check_hsa_status_ = (x);                       \
        if (check_hsa_status_ != HSA_STATUS_SUCCESS) {            \
            std::cerr << __FILE__ << ':' << __LINE__ << std::endl; \
            abort();                                              \
        }                                                         \
    } while (0)
|
|
|
|
extern "C" const uint32_t HSA_AMD_TOOL_PRIORITY = 25;
|
|
decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn = nullptr;
|
|
decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn = nullptr;
|
|
decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn = nullptr;
|
|
decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn = nullptr;
|
|
decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn = nullptr;
|
|
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn = nullptr;
|
|
decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn = nullptr;
|
|
|
|
decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn = nullptr;
|
|
decltype(hsa_agent_get_info)* hsa_agent_get_info_fn = nullptr;
|
|
decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn = nullptr;
|
|
decltype(hsa_queue_create)* hsa_queue_create_fn = nullptr;
|
|
|
|
hsa_amd_memory_pool_t cpu_pool;
|
|
|
|
// Memory-pool iteration callback: records in `cpu_pool` a pool that belongs to
// the global segment and does NOT carry the KERNARG_INIT flag.
//
// Every matching pool overwrites `cpu_pool`, so the last match visited wins.
// `data` is unused. Always returns HSA_STATUS_SUCCESS so the iteration visits
// every pool; a failing info query aborts via CHECK_HSA.
hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data)
{
    hsa_amd_segment_t pool_segment;
    CHECK_HSA(hsa_amd_memory_pool_get_info_fn(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_segment));

    if (pool_segment == HSA_AMD_SEGMENT_GLOBAL)
    {
        uint32_t global_flags;
        CHECK_HSA(hsa_amd_memory_pool_get_info_fn(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags));

        const bool is_kernarg_pool = (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
        if (!is_kernarg_pool)
        {
            cpu_pool = pool;
        }
    }

    return HSA_STATUS_SUCCESS;
}
|
|
|
|
// Agent iteration callback: for each CPU agent, walks its memory pools with
// FindGlobalPool so that `cpu_pool` gets populated. Non-CPU agents are ignored.
//
// `userdata` is unused. Always returns HSA_STATUS_SUCCESS so iteration
// continues over all agents; a failing HSA call aborts via CHECK_HSA.
hsa_status_t iterate_agent_cb(hsa_agent_t agent, void* userdata)
{
    hsa_device_type_t device_type;
    CHECK_HSA(hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_DEVICE, &device_type));

    if (device_type == HSA_DEVICE_TYPE_CPU)
    {
        CHECK_HSA(hsa_amd_agent_iterate_memory_pools_fn(agent, FindGlobalPool, nullptr));
    }

    return HSA_STATUS_SUCCESS;
}
|
|
|
|
// Writes `packet` into the next slot of `queue`, rings the doorbell and polls
// until the queue's read index has moved past that slot.
//
// Returns true once the read index exceeds the slot's write index. Returns
// false (after logging to stderr) if that has not happened after more than
// 10000 polls spaced by usleep(1).
bool queue_submit(hsa_queue_t* queue, hsa_ext_amd_aql_pm4_packet_t* packet)
{
    constexpr size_t kPacketBytes = sizeof(hsa_ext_amd_aql_pm4_packet_t);
    constexpr int kMaxPolls = 10000;

    // Reserve one slot; the returned value is the index of the reserved packet.
    const uint64_t slot_id = hsa_queue_add_write_index_relaxed_fn(queue, 1);

    // Locate the slot inside the ring buffer (index wraps at queue->size).
    const size_t byte_offset = (slot_id % queue->size) * kPacketBytes;
    auto* dst_words = reinterpret_cast<uint32_t*>(static_cast<char*>(queue->base_address) + byte_offset);  // NOLINT
    const auto* src_words = reinterpret_cast<const uint32_t*>(packet);

    // Copy everything except the first 32-bit word, then publish that first
    // word last with release ordering so the body is written before it.
    std::memcpy(dst_words + 1, src_words + 1, kPacketBytes - sizeof(uint32_t));
    reinterpret_cast<std::atomic<uint32_t>*>(dst_words)->store(src_words[0], std::memory_order_release);

    hsa_signal_store_screlease_fn(queue->doorbell_signal, slot_id);

    // Poll until the read index has passed our slot, giving up after kMaxPolls.
    for (int polls = 1; hsa_queue_load_read_index_relaxed_fn(queue) <= slot_id; ++polls)
    {
        usleep(1);
        if (polls > kMaxPolls)
        {
            std::cerr << "Packet submission failed!" << std::endl;
            return false;
        }
    }

    return true;
}
|
|
|
|
// Builds the aqlprofile ENABLE_CMD packet for `hsa_agent` and submits it on
// `queue`.
//
// Steps:
//   1. Query aqlprofile with a null output to obtain the command buffer size.
//   2. Allocate that buffer (rounded up to 4 KiB) from `cpu_pool` and grant
//      `hsa_agent` access to it.
//   3. Query again to have the PM4 packet filled in, then submit it.
//   4. Release the command buffer once the packet has been consumed.
//
// Any failing HSA/aqlprofile call in steps 1-3 aborts via CHECK_HSA.
void set_profiler_active_on_queue(
    hsa_agent_t hsa_agent,
    hsa_queue_t* queue
) {
    hsa_ext_amd_aql_pm4_packet_t packet{};
    hsa_ven_amd_aqlprofile_profile_t profile{};
    profile.agent = hsa_agent;

    // Query for cmd buffer size
    CHECK_HSA(hsa_ven_amd_aqlprofile_get_info(&profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, nullptr));

    // Allocate cmd buffer, size rounded up to a multiple of 0x1000 bytes
    const size_t mask = 0x1000 - 1;
    const auto size = (profile.command_buffer.size + mask) & ~mask;

    CHECK_HSA(hsa_amd_memory_pool_allocate_fn(cpu_pool, size,
                                              HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, &profile.command_buffer.ptr));
    CHECK_HSA(hsa_amd_agents_allow_access_fn(1, &hsa_agent, nullptr, profile.command_buffer.ptr));

    // Second query: fills `packet` using the allocated command buffer
    CHECK_HSA(hsa_ven_amd_aqlprofile_get_info(&profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, &packet));

    if (!queue_submit(queue, &packet))
    {
        // The packet is still sitting in the queue and may be processed later.
        // Freeing the command buffer now could let the device read freed
        // memory, so the buffer is deliberately left allocated here.
        // queue_submit has already reported the failure on stderr.
        return;
    }

    if (hsa_amd_memory_pool_free_fn(profile.command_buffer.ptr) != HSA_STATUS_SUCCESS)
    {
        // Not fatal for the application: report and carry on.
        std::cerr << "Command buffer free failed!" << std::endl;
    }
}
|
|
|
|
|
|
// Replacement for hsa_queue_create installed by OnLoad().
//
// Forwards all arguments to the saved original hsa_queue_create. If creation
// fails, the runtime's status is returned to the caller unchanged and no
// profiler setup is attempted, so the application can handle the failure as
// it would without this tool loaded. On success, the profiler enable packet
// is submitted on the new queue and HSA_STATUS_SUCCESS is returned.
hsa_status_t QueueCreateInterceptor(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
                                    void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
                                    void* data, uint32_t private_segment_size,
                                    uint32_t group_segment_size, hsa_queue_t** queue)
{
    const hsa_status_t status = hsa_queue_create_fn(agent, size, type, callback, data,
                                                    private_segment_size, group_segment_size, queue);
    if (status != HSA_STATUS_SUCCESS)
    {
        // No queue was created; propagate the error instead of aborting.
        return status;
    }

    //CHECK_HSA(hsa_amd_profiling_set_profiler_enabled_fn(*queue, true));
    set_profiler_active_on_queue(agent, *queue);
    return HSA_STATUS_SUCCESS;
}
|
|
|
|
extern "C" __attribute__((visibility("default"))) bool
|
|
OnLoad(HsaApiTable* table, uint64_t, uint64_t, const char* const*)
|
|
{
|
|
hsa_queue_create_fn = table->core_->hsa_queue_create_fn;
|
|
// Install the Queue intercept
|
|
table->core_->hsa_queue_create_fn = QueueCreateInterceptor;
|
|
hsa_amd_profiling_set_profiler_enabled_fn = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
|
|
hsa_amd_memory_pool_allocate_fn = table->amd_ext_->hsa_amd_memory_pool_allocate_fn;
|
|
hsa_amd_agents_allow_access_fn = table->amd_ext_->hsa_amd_agents_allow_access_fn;
|
|
hsa_amd_memory_pool_free_fn = table->amd_ext_->hsa_amd_memory_pool_free_fn;
|
|
hsa_signal_store_screlease_fn = table->core_->hsa_signal_store_screlease_fn;
|
|
hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn;
|
|
hsa_queue_add_write_index_relaxed_fn = table->core_->hsa_queue_add_write_index_relaxed_fn;
|
|
hsa_amd_memory_pool_get_info_fn = table->amd_ext_->hsa_amd_memory_pool_get_info_fn;
|
|
hsa_amd_agent_iterate_memory_pools_fn = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
|
|
hsa_agent_get_info_fn = table->core_->hsa_agent_get_info_fn;
|
|
|
|
CHECK_HSA(table->core_->hsa_iterate_agents_fn(iterate_agent_cb, nullptr));
|
|
|
|
return true;
|
|
}
|