diff --git a/samples/counter_collection/device_counting_synchronous.cpp b/samples/counter_collection/device_counting_synchronous.cpp index 252628ad3d..80e24dc5af 100644 --- a/samples/counter_collection/device_counting_synchronous.cpp +++ b/samples/counter_collection/device_counting_synchronous.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,8 @@ #include #include +#include +#include #include #include #include @@ -79,7 +82,7 @@ public: // Get the dimensions of a record (what CU/SE/etc the counter is for). High cost operation // should be cached if possible. - std::unordered_map get_record_dimensions( + static std::unordered_map get_record_dimensions( const rocprofiler_record_counter_t& rec); // Sample the counter values for a set of counters, returns the records in the out parameter. @@ -89,6 +92,9 @@ public: // Get the available agents on the system static std::vector get_available_agents(); + void flush() const { rocprofiler_flush_buffer(buf_); } + void stop() const { rocprofiler_stop_context(ctx_); } + private: rocprofiler_agent_id_t agent_ = {}; rocprofiler_context_id_t ctx_ = {}; @@ -97,20 +103,21 @@ private: std::map, rocprofiler_profile_config_id_t> cached_profiles_; std::map profile_sizes_; + mutable std::map id_to_name_; // Internal function used to set the profile for the agent when start_context is called void set_profile(rocprofiler_context_id_t ctx, rocprofiler_agent_set_profile_callback_t cb) const; // Get the size of a counter in number of records - size_t get_counter_size(rocprofiler_counter_id_t counter); + static size_t get_counter_size(rocprofiler_counter_id_t counter); // Get the supported counters for an agent static std::unordered_map get_supported_counters( rocprofiler_agent_id_t agent); // Get the dimensions of a counter - std::vector get_counter_dimensions( + static std::vector get_counter_dimensions( rocprofiler_counter_id_t counter); }; @@ -161,18 +168,18 @@ counter_sampler::counter_sampler(rocprofiler_agent_id_t agent) const std::string& counter_sampler::decode_record_name(const rocprofiler_record_counter_t& rec) const { - static auto roc_counters = [this]() { + if(id_to_name_.empty()) + { auto name_to_id = counter_sampler::get_supported_counters(agent_); - std::map id_to_name; for(const auto& [name, id] : name_to_id) { - id_to_name.emplace(id.handle, name); + id_to_name_.emplace(id.handle, name); } - return id_to_name; - }(); + } + rocprofiler_counter_id_t counter_id = {.handle = 0}; rocprofiler_query_record_counter_id(rec.id, &counter_id); - return roc_counters.at(counter_id.handle); + return id_to_name_.at(counter_id.handle); } std::unordered_map @@ -353,11 +360,22 @@ exit_toggle() static std::atomic exit_toggle = false; return exit_toggle; } + +rocprofiler_client_finalize_t finalize = nullptr; +rocprofiler_client_id_t* client_id = nullptr; +std::shared_ptr sampler = {}; +std::thread* sampler_thread = nullptr; } // namespace int -tool_init(rocprofiler_client_finalize_t, void*) +tool_init(rocprofiler_client_finalize_t fini_func, void*) { + finalize = fini_func; + + std::atexit([]() { + if(client_id) finalize(*client_id); + }); + // Get the agents available on the device auto agents = counter_sampler::get_available_agents(); if(agents.empty()) @@ -367,23 +385,25 @@ tool_init(rocprofiler_client_finalize_t, void*) } // Use the first agent found - std::shared_ptr sampler = std::make_shared(agents[0].id); + sampler = std::make_shared(agents[0].id); - std::thread([=]() { + sampler_thread = new std::thread{[=]() { size_t count = 1; std::vector records; - while(exit_toggle().load() == false) + while(sampler && exit_toggle().load() == false) { sampler->sample_counter_values({"SQ_WAVES"}, records); std::clog << "Sample " << count << ":\n"; for(const auto& record : records) { - std::clog << "\tCounter: " << record.id - << " Name: " << sampler->decode_record_name(record) + if(!sampler) break; + auto recname = sampler->decode_record_name(record); + std::clog << "\tCounter: " << record.id << " Name: " << recname << " Value: " << record.counter_value << " User data: " << record.user_data.value << "\n"; if(count == 1) { + if(!sampler) break; auto dims = sampler->get_record_dimensions(record); for(const auto& [name, pos] : dims) { @@ -395,7 +415,7 @@ tool_init(rocprofiler_client_finalize_t, void*) std::this_thread::sleep_for(std::chrono::milliseconds(50)); } exit_toggle().store(false); - }).detach(); + }}; // no errors return 0; @@ -404,13 +424,23 @@ tool_init(rocprofiler_client_finalize_t, void*) void tool_fini(void* user_data) { + client_id = nullptr; + exit_toggle().store(true); while(exit_toggle().load() == true) {}; + sampler->stop(); + sampler->flush(); + + sampler_thread->join(); + auto* output_stream = static_cast(user_data); *output_stream << std::flush; if(output_stream != &std::cout && output_stream != &std::cerr) delete output_stream; + + sampler.reset(); + delete sampler_thread; } extern "C" rocprofiler_tool_configure_result_t* @@ -420,7 +450,8 @@ rocprofiler_configure(uint32_t version, rocprofiler_client_id_t* id) { // set the client name - id->name = "CounterClientSample"; + id->name = "CounterClientSample"; + client_id = id; // compute major/minor/patch version info uint32_t major = version / 10000; diff --git a/source/lib/rocprofiler-sdk/aql/helpers.cpp b/source/lib/rocprofiler-sdk/aql/helpers.cpp index 44c4283a11..35842b1233 100644 --- a/source/lib/rocprofiler-sdk/aql/helpers.cpp +++ b/source/lib/rocprofiler-sdk/aql/helpers.cpp @@ -139,19 +139,8 @@ set_profiler_active_on_queue(hsa_amd_memory_pool_t pool, const size_t mask = 0x1000 - 1; auto size = (profile.command_buffer.size + mask) & ~mask; -#define HSA_AMD_INTERFACE_VERSION \ - ROCPROFILER_COMPUTE_VERSION(HSA_AMD_INTERFACE_VERSION_MAJOR, HSA_AMD_INTERFACE_VERSION_MINOR, 0) - -#if HSA_AMD_INTERFACE_VERSION >= 10700 - constexpr auto hsa_amd_memory_pool_executable_flag = HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG; -#elif HSA_AMD_INTERFACE_VERSION == 10600 - constexpr auto hsa_amd_memory_pool_executable_flag = (1 << 2); -#else - constexpr auto hsa_amd_memory_pool_executable_flag = 0; -#endif - if(hsa::get_amd_ext_table()->hsa_amd_memory_pool_allocate_fn( - pool, size, hsa_amd_memory_pool_executable_flag, &profile.command_buffer.ptr) != + pool, size, hsa::hsa_amd_memory_pool_executable_flag, &profile.command_buffer.ptr) != HSA_STATUS_SUCCESS) { ROCP_WARNING << "Failed to allocate memory to enable profile command on agent, some " diff --git a/source/lib/rocprofiler-sdk/context/context.cpp b/source/lib/rocprofiler-sdk/context/context.cpp index 3f8eeb78bf..1a11be653e 100644 --- a/source/lib/rocprofiler-sdk/context/context.cpp +++ b/source/lib/rocprofiler-sdk/context/context.cpp @@ -172,6 +172,20 @@ get_active_contexts(context_filter_t filter) return data; } +const context* +get_active_context(rocprofiler_context_id_t id) +{ + if(get_num_active_contexts().load(std::memory_order_acquire) > 0) + { + for(auto& itr : get_active_contexts_impl()) + { + const auto* ctx = itr.load(std::memory_order_acquire); + if(ctx && ctx->context_idx == id.handle) return ctx; + } + } + return nullptr; +} + // set the client index needs to be called before allocate_context() void push_client(uint32_t value) diff --git a/source/lib/rocprofiler-sdk/context/context.hpp b/source/lib/rocprofiler-sdk/context/context.hpp index 0bdc4ad356..e7bcf5160c 100644 --- a/source/lib/rocprofiler-sdk/context/context.hpp +++ b/source/lib/rocprofiler-sdk/context/context.hpp @@ -199,6 +199,9 @@ get_active_contexts(context_array_t& data, context_filter_t filter = default_con context_array_t get_active_contexts(context_filter_t filter = default_context_filter); +const context* +get_active_context(rocprofiler_context_id_t id); + /// \brief disable the contexturation. rocprofiler_status_t stop_client_contexts(rocprofiler_client_id_t id); diff --git a/source/lib/rocprofiler-sdk/device_counting_service.cpp b/source/lib/rocprofiler-sdk/device_counting_service.cpp index 25aba246cc..89e4d680df 100644 --- a/source/lib/rocprofiler-sdk/device_counting_service.cpp +++ b/source/lib/rocprofiler-sdk/device_counting_service.cpp @@ -25,10 +25,16 @@ #include "lib/rocprofiler-sdk/context/context.hpp" #include "lib/rocprofiler-sdk/counters/core.hpp" #include "lib/rocprofiler-sdk/counters/device_counting.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" #include "rocprofiler-sdk/fwd.h" #include +namespace +{ +constexpr auto rocprofiler_context_none = ROCPROFILER_CONTEXT_NONE; +} + extern "C" { rocprofiler_status_t rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id, @@ -48,14 +54,26 @@ rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context rocprofiler_record_counter_t* output_records, size_t* rec_count) { + if(context_id == rocprofiler_context_none) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND; + + // if finalized or finalizing, ignore request + if(rocprofiler::registration::get_fini_status() != 0) return ROCPROFILER_STATUS_ERROR_FINALIZED; + + // capture the active context status now + const auto* ctx = rocprofiler::context::get_active_context(context_id); + + // do not proceed if context has not been started + if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED; + if(output_records != nullptr) { - if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0) + if(!rec_count || (flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; - CHECK(rec_count); + + if(*rec_count == 0) return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES; + auto recs = std::vector{}; - auto status = rocprofiler::counters::read_agent_ctx( - rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs); + auto status = rocprofiler::counters::read_agent_ctx(ctx, user_data, flags, &recs); if(status == ROCPROFILER_STATUS_SUCCESS) { if(recs.size() > *rec_count) @@ -70,7 +88,6 @@ rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context return status; } - return rocprofiler::counters::read_agent_ctx( - rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr); + return rocprofiler::counters::read_agent_ctx(ctx, user_data, flags, nullptr); } } diff --git a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp index 39f7258645..f1c0c3a56e 100644 --- a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp +++ b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp @@ -57,11 +57,16 @@ CounterAQLPacket::CounterMemoryPool::Alloc(void** ptr, size_t size, desc_t flags hsa_status_t status; if(!pool.bIgnoreKernArg && flags.memory_hint == AQLPROFILE_MEMORY_HINT_DEVICE_UNCACHED) - status = pool.allocate_fn(pool.kernarg_pool_, size, 0, ptr); + status = + pool.allocate_fn(pool.kernarg_pool_, size, hsa_amd_memory_pool_executable_flag, ptr); else - status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr); + status = pool.allocate_fn(pool.cpu_pool_, size, hsa_amd_memory_pool_executable_flag, ptr); - if(status != HSA_STATUS_SUCCESS) return status; + if(status != HSA_STATUS_SUCCESS) + { + ROCP_FATAL << "Could not allocate memory"; + return status; + } status = pool.fill_fn(*ptr, 0u, size / sizeof(uint32_t)); if(status != HSA_STATUS_SUCCESS) return status; @@ -149,7 +154,7 @@ TraceMemoryPool::Alloc(void** ptr, size_t size, desc_t flags, void* data) hsa_status_t status = HSA_STATUS_ERROR; if(flags.host_access) { - status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr); + status = pool.allocate_fn(pool.cpu_pool_, size, hsa_amd_memory_pool_executable_flag, ptr); if(status == HSA_STATUS_SUCCESS) status = pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr); @@ -157,7 +162,8 @@ TraceMemoryPool::Alloc(void** ptr, size_t size, desc_t flags, void* data) else { // Return page aligned data to avoid cache flush overlap - status = pool.allocate_fn(pool.gpu_pool_, size + 0x2000, 0, ptr); + status = pool.allocate_fn( + pool.gpu_pool_, size + 0x2000, hsa_amd_memory_pool_executable_flag, ptr); *ptr = (void*) ((uintptr_t(*ptr) + 0xFFF) & ~0xFFFul); // NOLINT(performance-no-int-to-ptr) } return status; diff --git a/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp b/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp index 534da907d6..600e2d20a3 100644 --- a/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp +++ b/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp @@ -40,6 +40,15 @@ class ThreadTraceAQLPacketFactory; namespace hsa { +#define HSA_AMD_INTERFACE_VERSION \ + ROCPROFILER_COMPUTE_VERSION(HSA_AMD_INTERFACE_VERSION_MAJOR, HSA_AMD_INTERFACE_VERSION_MINOR, 0) + +#if HSA_AMD_INTERFACE_VERSION >= 10700 +constexpr auto hsa_amd_memory_pool_executable_flag = HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG; +#else +constexpr auto hsa_amd_memory_pool_executable_flag = (1 << 2); +#endif + constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = { .header = 0, .pm4_command = {0}, diff --git a/tests/bin/reproducible-runtime/CMakeLists.txt b/tests/bin/reproducible-runtime/CMakeLists.txt index 5302eeb194..cec21661dd 100644 --- a/tests/bin/reproducible-runtime/CMakeLists.txt +++ b/tests/bin/reproducible-runtime/CMakeLists.txt @@ -19,6 +19,10 @@ endif() project(rocprofiler-tests-bin-reproducible-runtime LANGUAGES CXX HIP) +if(NOT CMAKE_BUILD_TYPE MATCHES "(Release|RelWithDebInfo)") + set(CMAKE_BUILD_TYPE "RelWithDebInfo") +endif() + foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") diff --git a/tests/bin/reproducible-runtime/reproducible-runtime.cpp b/tests/bin/reproducible-runtime/reproducible-runtime.cpp index 7514bb7eaa..29a3fba621 100644 --- a/tests/bin/reproducible-runtime/reproducible-runtime.cpp +++ b/tests/bin/reproducible-runtime/reproducible-runtime.cpp @@ -58,7 +58,7 @@ namespace using auto_lock_t = std::unique_lock; auto print_lock = std::mutex{}; double nruntime = 500.0; // ms -uint32_t nspin = 256 * 10000; +uint32_t nspin = 128 * 10000; size_t nthreads = 2; void @@ -144,7 +144,7 @@ run(int tid, int devid) do { roctxMark("iteration"); - uint32_t cyclesleft = 2000 * 1000 * (nruntime - static_cast(time)); + uint32_t cyclesleft = 1000 * 1000 * (nruntime - static_cast(time)); HIP_API_CALL(hipEventRecord(start, stream)); reproducible_runtime<<>>(std::min(nspin, cyclesleft)); HIP_API_CALL(hipEventRecord(stop, stream));