From bcdecc7ff43fd44ad7a5c9cedd162fb48653b1ba Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Sat, 23 Sep 2023 15:58:13 +0000 Subject: [PATCH] PC Sampling: Add start stop and flush APIs Create PC Sampling APIs for start and stop functions. And create stub for flush function. Change-Id: I7a093b29dc87e34ac06faaae6cac2be50e4663e1 [ROCm/ROCR-Runtime commit: a84224748222fabd925869dccedcf836bc8d877b] --- .../hsa-runtime/core/inc/amd_gpu_agent.h | 13 ++++ .../core/runtime/amd_gpu_agent.cpp | 76 +++++++++++++++++++ .../runtime/hsa-runtime/hsacore.so.def | 3 + .../pcs/hsa_ven_amd_pc_sampling.cpp | 18 +++++ .../pcs/inc/hsa_ven_amd_pc_sampling_impl.h | 6 ++ .../runtime/hsa-runtime/pcs/pcs_runtime.cpp | 37 ++++++++- .../runtime/hsa-runtime/pcs/pcs_runtime.h | 11 ++- 7 files changed, 161 insertions(+), 3 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index a9902416d3..868a08dbe8 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -199,6 +199,12 @@ class GpuAgentInt : public core::Agent { pcs::PcsRuntime::PcSamplingSession& session) = 0; virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) = 0; + + virtual hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) = 0; }; class GpuAgent : public GpuAgentInt { @@ -485,6 +491,12 @@ class GpuAgent : public GpuAgentInt { hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId, pcs::PcsRuntime::PcSamplingSession& session); hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session); + hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session); + + static void PcSamplingThreadRun(void* agent); + void PcSamplingThread(); // @brief Node properties. const HsaNodeProperties properties_; @@ -691,6 +703,7 @@ class GpuAgent : public GpuAgentInt { /* PC Sampling fields - begin */ typedef struct { + os::Thread thread; pcs::PcsRuntime::PcSamplingSession* session; } pcs_hosttrap_t; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 77c2f0444d..7b230ba736 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -2400,6 +2400,8 @@ hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId, } hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) { + if (PcSamplingStop(session) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId()); ht_data.session = NULL; @@ -2407,5 +2409,79 @@ hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& ses return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } +hsa_status_t GpuAgent::PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session) { + if (session.isActive()) return HSA_STATUS_SUCCESS; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + auto method = session.method(); + if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + if (ht_data.session->isActive()) { + debug_warning("Already have a Host trap session in progress!"); + return (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY; + } + ht_data.session->start(); + // This thread will handle all hosttrap sessions on this agent + // In the future, there will be another thread to handle stochastic sessions. + ht_data.thread = os::CreateThread(PcSamplingThreadRun, (void*)this); + if (!ht_data.thread) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Failed to start PC Sampling thread."); + } + + if (hsaKmtPcSamplingStart(node_id(), session.ThunkId()) == HSAKMT_STATUS_SUCCESS) + return HSA_STATUS_SUCCESS; + + debug_print("Failed to start PC sampling session with thunkId:%d\n", session.ThunkId()); + if (method == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + ht_data.session->stop(); + os::WaitForThread(ht_data.thread); + os::CloseThread(ht_data.thread); + ht_data.thread = NULL; + } + + return HSA_STATUS_ERROR; +} + +hsa_status_t GpuAgent::PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session) { + if (!session.isActive()) return HSA_STATUS_SUCCESS; + + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + + session.stop(); + + HSAKMT_STATUS retKmt = hsaKmtPcSamplingStop(node_id(), session.ThunkId()); + if (retKmt != HSAKMT_STATUS_SUCCESS) + throw AMD::hsa_exception(HSA_STATUS_ERROR, "Failed to stop PC Sampling session."); + + if (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1) { + os::WaitForThread(ht_data.thread); + os::CloseThread(ht_data.thread); + ht_data.thread = NULL; + } + + return HSA_STATUS_SUCCESS; +} + +void GpuAgent::PcSamplingThread() { + pcs_hosttrap_t& ht_data = pcs_hosttrap_data_; + while (ht_data.session->isActive()) { + // Implement code to read data from 2nd level trap handler here + sleep(1); + } + debug_print("PcSamplingThread::Exiting\n"); +} + +void GpuAgent::PcSamplingThreadRun(void* _agent) { + GpuAgent* agent = (GpuAgent*)_agent; + agent->PcSamplingThread(); + debug_print("PcSamplingThread exiting..."); +} + +hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session) { + // TODO: implement me + return HSA_STATUS_SUCCESS; +} + } // namespace amd } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def index 24fa990b28..b7db01442f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def +++ b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def @@ -256,6 +256,9 @@ global: hsa_ven_amd_pcs_create; hsa_ven_amd_pcs_create_from_id; hsa_ven_amd_pcs_destroy; + hsa_ven_amd_pcs_start; + hsa_ven_amd_pcs_stop; + hsa_ven_amd_pcs_flush; local: *; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp b/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp index 3cc2fb86e5..f57d7353ca 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp @@ -134,6 +134,24 @@ hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t handle) { CATCH; } +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingStart(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingStop(handle); + CATCH; +} + +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t handle) { + TRY; + return PcsRuntime::instance()->PcSamplingFlush(handle); + CATCH; +} + void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api) { pcs_api->hsa_ven_amd_pcs_iterate_configuration_fn = hsa_ven_amd_pcs_iterate_configuration; pcs_api->hsa_ven_amd_pcs_create_fn = hsa_ven_amd_pcs_create; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h b/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h index 11cffde799..58ed4d4371 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h @@ -73,6 +73,12 @@ hsa_status_t hsa_ven_amd_pcs_create_from_id( hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + // Update Api table with func pointers that implement functionality void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp index 9fa56ea8b7..b0f57ca13e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp @@ -104,7 +104,7 @@ PcsRuntime::PcSamplingSession::PcSamplingSession( core::Agent* _agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data) - : agent(_agent), thunkId_(0), valid_(true), sample_size_(0) { + : agent(_agent), thunkId_(0), active_(false), valid_(true), sample_size_(0) { switch (method) { case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1: sample_size_ = sizeof(perf_sample_hosttrap_v1_t); @@ -240,6 +240,41 @@ hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) { return ret; } +hsa_status_t PcsRuntime::PcSamplingStart(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingStart(pcSamplingSessionIt->second); +} + +hsa_status_t PcsRuntime::PcSamplingStop(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingStop(pcSamplingSessionIt->second); +} + +hsa_status_t PcsRuntime::PcSamplingFlush(hsa_ven_amd_pcs_t handle) { + ScopedAcquire lock(&pc_sampling_lock_); + auto pcSamplingSessionIt = pc_sampling_.find(reinterpret_cast(handle.handle)); + if (pcSamplingSessionIt == pc_sampling_.end()) { + debug_warning(false && "Cannot find PcSampling session"); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + AMD::GpuAgentInt* gpu_agent = static_cast(pcSamplingSessionIt->second.agent); + + return gpu_agent->PcSamplingFlush(pcSamplingSessionIt->second); +} } // namespace pcs } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h index 606d05e950..3fbac4650e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h @@ -70,7 +70,7 @@ class PcsRuntime { class PcSamplingSession { public: - PcSamplingSession() : agent(NULL), thunkId_(0){}; + PcSamplingSession() : agent(NULL), thunkId_(0), active_(false){}; PcSamplingSession(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, @@ -88,11 +88,15 @@ class PcsRuntime { core::Agent* agent; void SetThunkId(HsaPcSamplingTraceId thunkId) { thunkId_ = thunkId; } HsaPcSamplingTraceId ThunkId() { return thunkId_; } + bool isActive() { return active_; } + void start() { active_ = true; } + void stop() { active_ = false; } private: HsaPcSamplingTraceId thunkId_; - bool valid_; // Whether configuration parameters are valid + bool active_; // Set to true when the session is started + bool valid_; // Whether configuration parameters are valid size_t sample_size_; struct client_session_data_t { @@ -126,6 +130,9 @@ class PcsRuntime { void* client_cb_data, hsa_ven_amd_pcs_t* handle); hsa_status_t PcSamplingDestroy(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingStart(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingStop(hsa_ven_amd_pcs_t handle); + hsa_status_t PcSamplingFlush(hsa_ven_amd_pcs_t handle); private: /// @brief Initialize singleton object, must be called once.