[SWDEV-513658] Force HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG value to be used with HSA calls (#192)
* Force HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG value to be used with HSA calls Fix for CI * More tweaks * Increase reproducible-runtime kernel sleep granularity * Fix data race in synchronous device counter collection sample * Update device counting service - add get_active_context function --------- Co-authored-by: Benjamin Welton <bewelton@amd.com> Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
Dieser Commit ist enthalten in:
committet von
GitHub
Ursprung
e67a4451d8
Commit
080b2ba451
@@ -24,6 +24,7 @@
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
@@ -37,6 +38,8 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <rocprofiler-sdk/buffer.h>
|
||||
#include <rocprofiler-sdk/context.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
@@ -79,7 +82,7 @@ public:
|
||||
|
||||
// Get the dimensions of a record (what CU/SE/etc the counter is for). High cost operation
|
||||
// should be cached if possible.
|
||||
std::unordered_map<std::string, size_t> get_record_dimensions(
|
||||
static std::unordered_map<std::string, size_t> get_record_dimensions(
|
||||
const rocprofiler_record_counter_t& rec);
|
||||
|
||||
// Sample the counter values for a set of counters, returns the records in the out parameter.
|
||||
@@ -89,6 +92,9 @@ public:
|
||||
// Get the available agents on the system
|
||||
static std::vector<rocprofiler_agent_v0_t> get_available_agents();
|
||||
|
||||
void flush() const { rocprofiler_flush_buffer(buf_); }
|
||||
void stop() const { rocprofiler_stop_context(ctx_); }
|
||||
|
||||
private:
|
||||
rocprofiler_agent_id_t agent_ = {};
|
||||
rocprofiler_context_id_t ctx_ = {};
|
||||
@@ -97,20 +103,21 @@ private:
|
||||
|
||||
std::map<std::vector<std::string>, rocprofiler_profile_config_id_t> cached_profiles_;
|
||||
std::map<uint64_t, uint64_t> profile_sizes_;
|
||||
mutable std::map<uint64_t, std::string> id_to_name_;
|
||||
|
||||
// Internal function used to set the profile for the agent when start_context is called
|
||||
void set_profile(rocprofiler_context_id_t ctx,
|
||||
rocprofiler_agent_set_profile_callback_t cb) const;
|
||||
|
||||
// Get the size of a counter in number of records
|
||||
size_t get_counter_size(rocprofiler_counter_id_t counter);
|
||||
static size_t get_counter_size(rocprofiler_counter_id_t counter);
|
||||
|
||||
// Get the supported counters for an agent
|
||||
static std::unordered_map<std::string, rocprofiler_counter_id_t> get_supported_counters(
|
||||
rocprofiler_agent_id_t agent);
|
||||
|
||||
// Get the dimensions of a counter
|
||||
std::vector<rocprofiler_record_dimension_info_t> get_counter_dimensions(
|
||||
static std::vector<rocprofiler_record_dimension_info_t> get_counter_dimensions(
|
||||
rocprofiler_counter_id_t counter);
|
||||
};
|
||||
|
||||
@@ -161,18 +168,18 @@ counter_sampler::counter_sampler(rocprofiler_agent_id_t agent)
|
||||
const std::string&
|
||||
counter_sampler::decode_record_name(const rocprofiler_record_counter_t& rec) const
|
||||
{
|
||||
static auto roc_counters = [this]() {
|
||||
if(id_to_name_.empty())
|
||||
{
|
||||
auto name_to_id = counter_sampler::get_supported_counters(agent_);
|
||||
std::map<uint64_t, std::string> id_to_name;
|
||||
for(const auto& [name, id] : name_to_id)
|
||||
{
|
||||
id_to_name.emplace(id.handle, name);
|
||||
id_to_name_.emplace(id.handle, name);
|
||||
}
|
||||
return id_to_name;
|
||||
}();
|
||||
}
|
||||
|
||||
rocprofiler_counter_id_t counter_id = {.handle = 0};
|
||||
rocprofiler_query_record_counter_id(rec.id, &counter_id);
|
||||
return roc_counters.at(counter_id.handle);
|
||||
return id_to_name_.at(counter_id.handle);
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, size_t>
|
||||
@@ -353,11 +360,22 @@ exit_toggle()
|
||||
static std::atomic<bool> exit_toggle = false;
|
||||
return exit_toggle;
|
||||
}
|
||||
|
||||
rocprofiler_client_finalize_t finalize = nullptr;
|
||||
rocprofiler_client_id_t* client_id = nullptr;
|
||||
std::shared_ptr<counter_sampler> sampler = {};
|
||||
std::thread* sampler_thread = nullptr;
|
||||
} // namespace
|
||||
|
||||
int
|
||||
tool_init(rocprofiler_client_finalize_t, void*)
|
||||
tool_init(rocprofiler_client_finalize_t fini_func, void*)
|
||||
{
|
||||
finalize = fini_func;
|
||||
|
||||
std::atexit([]() {
|
||||
if(client_id) finalize(*client_id);
|
||||
});
|
||||
|
||||
// Get the agents available on the device
|
||||
auto agents = counter_sampler::get_available_agents();
|
||||
if(agents.empty())
|
||||
@@ -367,23 +385,25 @@ tool_init(rocprofiler_client_finalize_t, void*)
|
||||
}
|
||||
|
||||
// Use the first agent found
|
||||
std::shared_ptr<counter_sampler> sampler = std::make_shared<counter_sampler>(agents[0].id);
|
||||
sampler = std::make_shared<counter_sampler>(agents[0].id);
|
||||
|
||||
std::thread([=]() {
|
||||
sampler_thread = new std::thread{[=]() {
|
||||
size_t count = 1;
|
||||
std::vector<rocprofiler_record_counter_t> records;
|
||||
while(exit_toggle().load() == false)
|
||||
while(sampler && exit_toggle().load() == false)
|
||||
{
|
||||
sampler->sample_counter_values({"SQ_WAVES"}, records);
|
||||
std::clog << "Sample " << count << ":\n";
|
||||
for(const auto& record : records)
|
||||
{
|
||||
std::clog << "\tCounter: " << record.id
|
||||
<< " Name: " << sampler->decode_record_name(record)
|
||||
if(!sampler) break;
|
||||
auto recname = sampler->decode_record_name(record);
|
||||
std::clog << "\tCounter: " << record.id << " Name: " << recname
|
||||
<< " Value: " << record.counter_value
|
||||
<< " User data: " << record.user_data.value << "\n";
|
||||
if(count == 1)
|
||||
{
|
||||
if(!sampler) break;
|
||||
auto dims = sampler->get_record_dimensions(record);
|
||||
for(const auto& [name, pos] : dims)
|
||||
{
|
||||
@@ -395,7 +415,7 @@ tool_init(rocprofiler_client_finalize_t, void*)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50));
|
||||
}
|
||||
exit_toggle().store(false);
|
||||
}).detach();
|
||||
}};
|
||||
|
||||
// no errors
|
||||
return 0;
|
||||
@@ -404,13 +424,23 @@ tool_init(rocprofiler_client_finalize_t, void*)
|
||||
void
|
||||
tool_fini(void* user_data)
|
||||
{
|
||||
client_id = nullptr;
|
||||
|
||||
exit_toggle().store(true);
|
||||
while(exit_toggle().load() == true)
|
||||
{};
|
||||
|
||||
sampler->stop();
|
||||
sampler->flush();
|
||||
|
||||
sampler_thread->join();
|
||||
|
||||
auto* output_stream = static_cast<std::ostream*>(user_data);
|
||||
*output_stream << std::flush;
|
||||
if(output_stream != &std::cout && output_stream != &std::cerr) delete output_stream;
|
||||
|
||||
sampler.reset();
|
||||
delete sampler_thread;
|
||||
}
|
||||
|
||||
extern "C" rocprofiler_tool_configure_result_t*
|
||||
@@ -420,7 +450,8 @@ rocprofiler_configure(uint32_t version,
|
||||
rocprofiler_client_id_t* id)
|
||||
{
|
||||
// set the client name
|
||||
id->name = "CounterClientSample";
|
||||
id->name = "CounterClientSample";
|
||||
client_id = id;
|
||||
|
||||
// compute major/minor/patch version info
|
||||
uint32_t major = version / 10000;
|
||||
|
||||
@@ -139,19 +139,8 @@ set_profiler_active_on_queue(hsa_amd_memory_pool_t pool,
|
||||
const size_t mask = 0x1000 - 1;
|
||||
auto size = (profile.command_buffer.size + mask) & ~mask;
|
||||
|
||||
#define HSA_AMD_INTERFACE_VERSION \
|
||||
ROCPROFILER_COMPUTE_VERSION(HSA_AMD_INTERFACE_VERSION_MAJOR, HSA_AMD_INTERFACE_VERSION_MINOR, 0)
|
||||
|
||||
#if HSA_AMD_INTERFACE_VERSION >= 10700
|
||||
constexpr auto hsa_amd_memory_pool_executable_flag = HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
|
||||
#elif HSA_AMD_INTERFACE_VERSION == 10600
|
||||
constexpr auto hsa_amd_memory_pool_executable_flag = (1 << 2);
|
||||
#else
|
||||
constexpr auto hsa_amd_memory_pool_executable_flag = 0;
|
||||
#endif
|
||||
|
||||
if(hsa::get_amd_ext_table()->hsa_amd_memory_pool_allocate_fn(
|
||||
pool, size, hsa_amd_memory_pool_executable_flag, &profile.command_buffer.ptr) !=
|
||||
pool, size, hsa::hsa_amd_memory_pool_executable_flag, &profile.command_buffer.ptr) !=
|
||||
HSA_STATUS_SUCCESS)
|
||||
{
|
||||
ROCP_WARNING << "Failed to allocate memory to enable profile command on agent, some "
|
||||
|
||||
@@ -172,6 +172,20 @@ get_active_contexts(context_filter_t filter)
|
||||
return data;
|
||||
}
|
||||
|
||||
const context*
|
||||
get_active_context(rocprofiler_context_id_t id)
|
||||
{
|
||||
if(get_num_active_contexts().load(std::memory_order_acquire) > 0)
|
||||
{
|
||||
for(auto& itr : get_active_contexts_impl())
|
||||
{
|
||||
const auto* ctx = itr.load(std::memory_order_acquire);
|
||||
if(ctx && ctx->context_idx == id.handle) return ctx;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// set the client index needs to be called before allocate_context()
|
||||
void
|
||||
push_client(uint32_t value)
|
||||
|
||||
@@ -199,6 +199,9 @@ get_active_contexts(context_array_t& data, context_filter_t filter = default_con
|
||||
context_array_t
|
||||
get_active_contexts(context_filter_t filter = default_context_filter);
|
||||
|
||||
const context*
|
||||
get_active_context(rocprofiler_context_id_t id);
|
||||
|
||||
/// \brief disable the contexturation.
|
||||
rocprofiler_status_t
|
||||
stop_client_contexts(rocprofiler_client_id_t id);
|
||||
|
||||
@@ -25,10 +25,16 @@
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/core.hpp"
|
||||
#include "lib/rocprofiler-sdk/counters/device_counting.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
#include "rocprofiler-sdk/fwd.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
namespace
|
||||
{
|
||||
constexpr auto rocprofiler_context_none = ROCPROFILER_CONTEXT_NONE;
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
rocprofiler_status_t
|
||||
rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id,
|
||||
@@ -48,14 +54,26 @@ rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context
|
||||
rocprofiler_record_counter_t* output_records,
|
||||
size_t* rec_count)
|
||||
{
|
||||
if(context_id == rocprofiler_context_none) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND;
|
||||
|
||||
// if finalized or finalizing, ignore request
|
||||
if(rocprofiler::registration::get_fini_status() != 0) return ROCPROFILER_STATUS_ERROR_FINALIZED;
|
||||
|
||||
// capture the active context status now
|
||||
const auto* ctx = rocprofiler::context::get_active_context(context_id);
|
||||
|
||||
// do not proceed if context has not been started
|
||||
if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
|
||||
|
||||
if(output_records != nullptr)
|
||||
{
|
||||
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
|
||||
if(!rec_count || (flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
|
||||
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
CHECK(rec_count);
|
||||
|
||||
if(*rec_count == 0) return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
|
||||
auto recs = std::vector<rocprofiler_record_counter_t>{};
|
||||
auto status = rocprofiler::counters::read_agent_ctx(
|
||||
rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs);
|
||||
auto status = rocprofiler::counters::read_agent_ctx(ctx, user_data, flags, &recs);
|
||||
if(status == ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
if(recs.size() > *rec_count)
|
||||
@@ -70,7 +88,6 @@ rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context
|
||||
return status;
|
||||
}
|
||||
|
||||
return rocprofiler::counters::read_agent_ctx(
|
||||
rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr);
|
||||
return rocprofiler::counters::read_agent_ctx(ctx, user_data, flags, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,11 +57,16 @@ CounterAQLPacket::CounterMemoryPool::Alloc(void** ptr, size_t size, desc_t flags
|
||||
|
||||
hsa_status_t status;
|
||||
if(!pool.bIgnoreKernArg && flags.memory_hint == AQLPROFILE_MEMORY_HINT_DEVICE_UNCACHED)
|
||||
status = pool.allocate_fn(pool.kernarg_pool_, size, 0, ptr);
|
||||
status =
|
||||
pool.allocate_fn(pool.kernarg_pool_, size, hsa_amd_memory_pool_executable_flag, ptr);
|
||||
else
|
||||
status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr);
|
||||
status = pool.allocate_fn(pool.cpu_pool_, size, hsa_amd_memory_pool_executable_flag, ptr);
|
||||
|
||||
if(status != HSA_STATUS_SUCCESS) return status;
|
||||
if(status != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
ROCP_FATAL << "Could not allocate memory";
|
||||
return status;
|
||||
}
|
||||
|
||||
status = pool.fill_fn(*ptr, 0u, size / sizeof(uint32_t));
|
||||
if(status != HSA_STATUS_SUCCESS) return status;
|
||||
@@ -149,7 +154,7 @@ TraceMemoryPool::Alloc(void** ptr, size_t size, desc_t flags, void* data)
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
if(flags.host_access)
|
||||
{
|
||||
status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr);
|
||||
status = pool.allocate_fn(pool.cpu_pool_, size, hsa_amd_memory_pool_executable_flag, ptr);
|
||||
|
||||
if(status == HSA_STATUS_SUCCESS)
|
||||
status = pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr);
|
||||
@@ -157,7 +162,8 @@ TraceMemoryPool::Alloc(void** ptr, size_t size, desc_t flags, void* data)
|
||||
else
|
||||
{
|
||||
// Return page aligned data to avoid cache flush overlap
|
||||
status = pool.allocate_fn(pool.gpu_pool_, size + 0x2000, 0, ptr);
|
||||
status = pool.allocate_fn(
|
||||
pool.gpu_pool_, size + 0x2000, hsa_amd_memory_pool_executable_flag, ptr);
|
||||
*ptr = (void*) ((uintptr_t(*ptr) + 0xFFF) & ~0xFFFul); // NOLINT(performance-no-int-to-ptr)
|
||||
}
|
||||
return status;
|
||||
|
||||
@@ -40,6 +40,15 @@ class ThreadTraceAQLPacketFactory;
|
||||
|
||||
namespace hsa
|
||||
{
|
||||
#define HSA_AMD_INTERFACE_VERSION \
|
||||
ROCPROFILER_COMPUTE_VERSION(HSA_AMD_INTERFACE_VERSION_MAJOR, HSA_AMD_INTERFACE_VERSION_MINOR, 0)
|
||||
|
||||
#if HSA_AMD_INTERFACE_VERSION >= 10700
|
||||
constexpr auto hsa_amd_memory_pool_executable_flag = HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG;
|
||||
#else
|
||||
constexpr auto hsa_amd_memory_pool_executable_flag = (1 << 2);
|
||||
#endif
|
||||
|
||||
constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = {
|
||||
.header = 0,
|
||||
.pm4_command = {0},
|
||||
|
||||
@@ -19,6 +19,10 @@ endif()
|
||||
|
||||
project(rocprofiler-tests-bin-reproducible-runtime LANGUAGES CXX HIP)
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE MATCHES "(Release|RelWithDebInfo)")
|
||||
set(CMAKE_BUILD_TYPE "RelWithDebInfo")
|
||||
endif()
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
|
||||
@@ -58,7 +58,7 @@ namespace
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
double nruntime = 500.0; // ms
|
||||
uint32_t nspin = 256 * 10000;
|
||||
uint32_t nspin = 128 * 10000;
|
||||
size_t nthreads = 2;
|
||||
|
||||
void
|
||||
@@ -144,7 +144,7 @@ run(int tid, int devid)
|
||||
do
|
||||
{
|
||||
roctxMark("iteration");
|
||||
uint32_t cyclesleft = 2000 * 1000 * (nruntime - static_cast<double>(time));
|
||||
uint32_t cyclesleft = 1000 * 1000 * (nruntime - static_cast<double>(time));
|
||||
HIP_API_CALL(hipEventRecord(start, stream));
|
||||
reproducible_runtime<<<grid, block, 0, stream>>>(std::min<uint32_t>(nspin, cyclesleft));
|
||||
HIP_API_CALL(hipEventRecord(stop, stream));
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren