[SWDEV-513658] Force HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG value to be used with HSA calls (#192)

* Force HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG  value to be used with HSA calls

Fix for CI

* More tweaks

* Increase reproducible-runtime kernel sleep granularity

* Fix data race in synchronous device counter collection sample

* Update device counting service

- add get_active_context function

---------

Co-authored-by: Benjamin Welton <bewelton@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
Dieser Commit ist enthalten in:
Welton, Benjamin
2025-02-10 09:34:26 -08:00
committet von GitHub
Ursprung e67a4451d8
Commit 080b2ba451
9 geänderte Dateien mit 115 neuen und 42 gelöschten Zeilen
@@ -24,6 +24,7 @@
#include <atomic>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
@@ -37,6 +38,8 @@
#include <unordered_map>
#include <vector>
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/context.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
@@ -79,7 +82,7 @@ public:
// Get the dimensions of a record (what CU/SE/etc the counter is for). High cost operation
// should be cached if possible.
std::unordered_map<std::string, size_t> get_record_dimensions(
static std::unordered_map<std::string, size_t> get_record_dimensions(
const rocprofiler_record_counter_t& rec);
// Sample the counter values for a set of counters, returns the records in the out parameter.
@@ -89,6 +92,9 @@ public:
// Get the available agents on the system
static std::vector<rocprofiler_agent_v0_t> get_available_agents();
void flush() const { rocprofiler_flush_buffer(buf_); }
void stop() const { rocprofiler_stop_context(ctx_); }
private:
rocprofiler_agent_id_t agent_ = {};
rocprofiler_context_id_t ctx_ = {};
@@ -97,20 +103,21 @@ private:
std::map<std::vector<std::string>, rocprofiler_profile_config_id_t> cached_profiles_;
std::map<uint64_t, uint64_t> profile_sizes_;
mutable std::map<uint64_t, std::string> id_to_name_;
// Internal function used to set the profile for the agent when start_context is called
void set_profile(rocprofiler_context_id_t ctx,
rocprofiler_agent_set_profile_callback_t cb) const;
// Get the size of a counter in number of records
size_t get_counter_size(rocprofiler_counter_id_t counter);
static size_t get_counter_size(rocprofiler_counter_id_t counter);
// Get the supported counters for an agent
static std::unordered_map<std::string, rocprofiler_counter_id_t> get_supported_counters(
rocprofiler_agent_id_t agent);
// Get the dimensions of a counter
std::vector<rocprofiler_record_dimension_info_t> get_counter_dimensions(
static std::vector<rocprofiler_record_dimension_info_t> get_counter_dimensions(
rocprofiler_counter_id_t counter);
};
@@ -161,18 +168,18 @@ counter_sampler::counter_sampler(rocprofiler_agent_id_t agent)
const std::string&
counter_sampler::decode_record_name(const rocprofiler_record_counter_t& rec) const
{
static auto roc_counters = [this]() {
if(id_to_name_.empty())
{
auto name_to_id = counter_sampler::get_supported_counters(agent_);
std::map<uint64_t, std::string> id_to_name;
for(const auto& [name, id] : name_to_id)
{
id_to_name.emplace(id.handle, name);
id_to_name_.emplace(id.handle, name);
}
return id_to_name;
}();
}
rocprofiler_counter_id_t counter_id = {.handle = 0};
rocprofiler_query_record_counter_id(rec.id, &counter_id);
return roc_counters.at(counter_id.handle);
return id_to_name_.at(counter_id.handle);
}
std::unordered_map<std::string, size_t>
@@ -353,11 +360,22 @@ exit_toggle()
static std::atomic<bool> exit_toggle = false;
return exit_toggle;
}
rocprofiler_client_finalize_t finalize = nullptr;
rocprofiler_client_id_t* client_id = nullptr;
std::shared_ptr<counter_sampler> sampler = {};
std::thread* sampler_thread = nullptr;
} // namespace
int
tool_init(rocprofiler_client_finalize_t, void*)
tool_init(rocprofiler_client_finalize_t fini_func, void*)
{
finalize = fini_func;
std::atexit([]() {
if(client_id) finalize(*client_id);
});
// Get the agents available on the device
auto agents = counter_sampler::get_available_agents();
if(agents.empty())
@@ -367,23 +385,25 @@ tool_init(rocprofiler_client_finalize_t, void*)
}
// Use the first agent found
std::shared_ptr<counter_sampler> sampler = std::make_shared<counter_sampler>(agents[0].id);
sampler = std::make_shared<counter_sampler>(agents[0].id);
std::thread([=]() {
sampler_thread = new std::thread{[=]() {
size_t count = 1;
std::vector<rocprofiler_record_counter_t> records;
while(exit_toggle().load() == false)
while(sampler && exit_toggle().load() == false)
{
sampler->sample_counter_values({"SQ_WAVES"}, records);
std::clog << "Sample " << count << ":\n";
for(const auto& record : records)
{
std::clog << "\tCounter: " << record.id
<< " Name: " << sampler->decode_record_name(record)
if(!sampler) break;
auto recname = sampler->decode_record_name(record);
std::clog << "\tCounter: " << record.id << " Name: " << recname
<< " Value: " << record.counter_value
<< " User data: " << record.user_data.value << "\n";
if(count == 1)
{
if(!sampler) break;
auto dims = sampler->get_record_dimensions(record);
for(const auto& [name, pos] : dims)
{
@@ -395,7 +415,7 @@ tool_init(rocprofiler_client_finalize_t, void*)
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
exit_toggle().store(false);
}).detach();
}};
// no errors
return 0;
@@ -404,13 +424,23 @@ tool_init(rocprofiler_client_finalize_t, void*)
void
tool_fini(void* user_data)
{
client_id = nullptr;
exit_toggle().store(true);
while(exit_toggle().load() == true)
{};
sampler->stop();
sampler->flush();
sampler_thread->join();
auto* output_stream = static_cast<std::ostream*>(user_data);
*output_stream << std::flush;
if(output_stream != &std::cout && output_stream != &std::cerr) delete output_stream;
sampler.reset();
delete sampler_thread;
}
extern "C" rocprofiler_tool_configure_result_t*
@@ -420,7 +450,8 @@ rocprofiler_configure(uint32_t version,
rocprofiler_client_id_t* id)
{
// set the client name
id->name = "CounterClientSample";
id->name = "CounterClientSample";
client_id = id;
// compute major/minor/patch version info
uint32_t major = version / 10000;
+1 -12
Datei anzeigen
@@ -139,19 +139,8 @@ set_profiler_active_on_queue(hsa_amd_memory_pool_t pool,
const size_t mask = 0x1000 - 1;
auto size = (profile.command_buffer.size + mask) & ~mask;
#define HSA_AMD_INTERFACE_VERSION \
ROCPROFILER_COMPUTE_VERSION(HSA_AMD_INTERFACE_VERSION_MAJOR, HSA_AMD_INTERFACE_VERSION_MINOR, 0)
#if HSA_AMD_INTERFACE_VERSION >= 10700
constexpr auto hsa_amd_memory_pool_executable_flag = HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
#elif HSA_AMD_INTERFACE_VERSION == 10600
constexpr auto hsa_amd_memory_pool_executable_flag = (1 << 2);
#else
constexpr auto hsa_amd_memory_pool_executable_flag = 0;
#endif
if(hsa::get_amd_ext_table()->hsa_amd_memory_pool_allocate_fn(
pool, size, hsa_amd_memory_pool_executable_flag, &profile.command_buffer.ptr) !=
pool, size, hsa::hsa_amd_memory_pool_executable_flag, &profile.command_buffer.ptr) !=
HSA_STATUS_SUCCESS)
{
ROCP_WARNING << "Failed to allocate memory to enable profile command on agent, some "
@@ -172,6 +172,20 @@ get_active_contexts(context_filter_t filter)
return data;
}
const context*
get_active_context(rocprofiler_context_id_t id)
{
if(get_num_active_contexts().load(std::memory_order_acquire) > 0)
{
for(auto& itr : get_active_contexts_impl())
{
const auto* ctx = itr.load(std::memory_order_acquire);
if(ctx && ctx->context_idx == id.handle) return ctx;
}
}
return nullptr;
}
// set the client index needs to be called before allocate_context()
void
push_client(uint32_t value)
@@ -199,6 +199,9 @@ get_active_contexts(context_array_t& data, context_filter_t filter = default_con
context_array_t
get_active_contexts(context_filter_t filter = default_context_filter);
const context*
get_active_context(rocprofiler_context_id_t id);
/// \brief disable the contexturation.
rocprofiler_status_t
stop_client_contexts(rocprofiler_client_id_t id);
@@ -25,10 +25,16 @@
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/counters/core.hpp"
#include "lib/rocprofiler-sdk/counters/device_counting.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include "rocprofiler-sdk/fwd.h"
#include <string.h>
namespace
{
constexpr auto rocprofiler_context_none = ROCPROFILER_CONTEXT_NONE;
}
extern "C" {
rocprofiler_status_t
rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id,
@@ -48,14 +54,26 @@ rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context
rocprofiler_record_counter_t* output_records,
size_t* rec_count)
{
if(context_id == rocprofiler_context_none) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND;
// if finalized or finalizing, ignore request
if(rocprofiler::registration::get_fini_status() != 0) return ROCPROFILER_STATUS_ERROR_FINALIZED;
// capture the active context status now
const auto* ctx = rocprofiler::context::get_active_context(context_id);
// do not proceed if context has not been started
if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
if(output_records != nullptr)
{
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
if(!rec_count || (flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
CHECK(rec_count);
if(*rec_count == 0) return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
auto recs = std::vector<rocprofiler_record_counter_t>{};
auto status = rocprofiler::counters::read_agent_ctx(
rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs);
auto status = rocprofiler::counters::read_agent_ctx(ctx, user_data, flags, &recs);
if(status == ROCPROFILER_STATUS_SUCCESS)
{
if(recs.size() > *rec_count)
@@ -70,7 +88,6 @@ rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context
return status;
}
return rocprofiler::counters::read_agent_ctx(
rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr);
return rocprofiler::counters::read_agent_ctx(ctx, user_data, flags, nullptr);
}
}
@@ -57,11 +57,16 @@ CounterAQLPacket::CounterMemoryPool::Alloc(void** ptr, size_t size, desc_t flags
hsa_status_t status;
if(!pool.bIgnoreKernArg && flags.memory_hint == AQLPROFILE_MEMORY_HINT_DEVICE_UNCACHED)
status = pool.allocate_fn(pool.kernarg_pool_, size, 0, ptr);
status =
pool.allocate_fn(pool.kernarg_pool_, size, hsa_amd_memory_pool_executable_flag, ptr);
else
status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr);
status = pool.allocate_fn(pool.cpu_pool_, size, hsa_amd_memory_pool_executable_flag, ptr);
if(status != HSA_STATUS_SUCCESS) return status;
if(status != HSA_STATUS_SUCCESS)
{
ROCP_FATAL << "Could not allocate memory";
return status;
}
status = pool.fill_fn(*ptr, 0u, size / sizeof(uint32_t));
if(status != HSA_STATUS_SUCCESS) return status;
@@ -149,7 +154,7 @@ TraceMemoryPool::Alloc(void** ptr, size_t size, desc_t flags, void* data)
hsa_status_t status = HSA_STATUS_ERROR;
if(flags.host_access)
{
status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr);
status = pool.allocate_fn(pool.cpu_pool_, size, hsa_amd_memory_pool_executable_flag, ptr);
if(status == HSA_STATUS_SUCCESS)
status = pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr);
@@ -157,7 +162,8 @@ TraceMemoryPool::Alloc(void** ptr, size_t size, desc_t flags, void* data)
else
{
// Return page aligned data to avoid cache flush overlap
status = pool.allocate_fn(pool.gpu_pool_, size + 0x2000, 0, ptr);
status = pool.allocate_fn(
pool.gpu_pool_, size + 0x2000, hsa_amd_memory_pool_executable_flag, ptr);
*ptr = (void*) ((uintptr_t(*ptr) + 0xFFF) & ~0xFFFul); // NOLINT(performance-no-int-to-ptr)
}
return status;
@@ -40,6 +40,15 @@ class ThreadTraceAQLPacketFactory;
namespace hsa
{
#define HSA_AMD_INTERFACE_VERSION \
ROCPROFILER_COMPUTE_VERSION(HSA_AMD_INTERFACE_VERSION_MAJOR, HSA_AMD_INTERFACE_VERSION_MINOR, 0)
#if HSA_AMD_INTERFACE_VERSION >= 10700
constexpr auto hsa_amd_memory_pool_executable_flag = HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
#else
constexpr auto hsa_amd_memory_pool_executable_flag = (1 << 2);
#endif
constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = {
.header = 0,
.pm4_command = {0},
@@ -19,6 +19,10 @@ endif()
project(rocprofiler-tests-bin-reproducible-runtime LANGUAGES CXX HIP)
if(NOT CMAKE_BUILD_TYPE MATCHES "(Release|RelWithDebInfo)")
set(CMAKE_BUILD_TYPE "RelWithDebInfo")
endif()
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
@@ -58,7 +58,7 @@ namespace
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
double nruntime = 500.0; // ms
uint32_t nspin = 256 * 10000;
uint32_t nspin = 128 * 10000;
size_t nthreads = 2;
void
@@ -144,7 +144,7 @@ run(int tid, int devid)
do
{
roctxMark("iteration");
uint32_t cyclesleft = 2000 * 1000 * (nruntime - static_cast<double>(time));
uint32_t cyclesleft = 1000 * 1000 * (nruntime - static_cast<double>(time));
HIP_API_CALL(hipEventRecord(start, stream));
reproducible_runtime<<<grid, block, 0, stream>>>(std::min<uint32_t>(nspin, cyclesleft));
HIP_API_CALL(hipEventRecord(stop, stream));