diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 36c3fbd6bc..a9902416d3 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -192,6 +192,17 @@ class GpuAgentInt : public core::Agent {
   // @retval HSA_STATUS_SUCCESS if successful
   virtual hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
                                                void* cb_data) = 0;
+
+  // @brief Create a PC sampling session through the kernel driver and store
+  // the trace id returned by the driver in @p session.
+  virtual hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) = 0;
+
+  // @brief Register @p session on this agent under the existing trace id @p pcsId.
+  virtual hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
+                                              pcs::PcsRuntime::PcSamplingSession& session) = 0;
+
+  // @brief Destroy the kernel driver trace of @p session and unregister it.
+  virtual hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) = 0;
 };
 
 class GpuAgent : public GpuAgentInt {
@@ -474,6 +485,10 @@ class GpuAgent : public GpuAgentInt {
 
   hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
                                        void* cb_data);
+  hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
+  hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
+                                      pcs::PcsRuntime::PcSamplingSession& session);
+  hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
 
   // @brief Node properties.
   const HsaNodeProperties properties_;
@@ -677,6 +692,16 @@ class GpuAgent : public GpuAgentInt {
 
   std::function finegrain_allocator_;
   std::function finegrain_deallocator_;
+
+  /* PC Sampling fields - begin */
+  typedef struct {
+    // Session currently holding the host-trap slot of this agent, NULL if none.
+    pcs::PcsRuntime::PcSamplingSession* session;
+  } pcs_hosttrap_t;
+
+  pcs_hosttrap_t pcs_hosttrap_data_;
+  /* PC Sampling fields - end */
+
   // @brief device handle
   amdgpu_device_handle ldrm_dev_;
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index bef9dd58d7..77c2f0444d 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -113,7 +113,8 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
       sdma_blit_used_mask_(0),
       scratch_limit_async_threshold_(0),
       scratch_cache_(
-          [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) {
+          [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }),
+      pcs_hosttrap_data_() {
   const bool is_apu_node = (properties_.NumCPUCores > 0);
   profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE;
 
@@ -2362,6 +2363,65 @@ hsa_status_t GpuAgent::PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configura
 
   return HSA_STATUS_SUCCESS;
 }
+
+// @brief Create a PC sampling session in the kernel driver for this agent.
+//
+// Registers @p session on the agent first, then asks the driver for a trace id.
+// The registration is rolled back when the driver call fails, so a failed
+// attempt cannot block later sessions or leave a stale pointer behind.
+hsa_status_t GpuAgent::PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session) {
+  HsaPcSamplingInfo sampleInfo = {};
+  HsaPcSamplingTraceId thunkId;
+
+  // IOCTL id does not exist at the moment, so passing 0 is OK,
+  // since it will be overridden later in this function.
+  hsa_status_t ret = PcSamplingCreateFromId(0, session);
+  if (ret != HSA_STATUS_SUCCESS) return ret;
+
+  session.GetHsaKmtSamplingInfo(&sampleInfo);
+  HSAKMT_STATUS retkmt = hsaKmtPcSamplingCreate(node_id(), &sampleInfo, &thunkId);
+  if (retkmt != HSAKMT_STATUS_SUCCESS) {
+    // Undo the registration done by PcSamplingCreateFromId: the caller discards
+    // the session on failure, so the agent must not keep pointing at it.
+    if (pcs_hosttrap_data_.session == &session) pcs_hosttrap_data_.session = NULL;
+    return (retkmt == HSAKMT_STATUS_KERNEL_ALREADY_OPENED) ? (hsa_status_t)HSA_STATUS_ERROR_RESOURCE_BUSY
+                                                           : HSA_STATUS_ERROR;
+  }
+
+  debug_print("Created PC sampling session with thunkId:%d\n", thunkId);
+
+  session.SetThunkId(thunkId);
+
+  return ret;
+}
+
+// @brief Register @p session on this agent under the trace id @p ioctlId.
+//
+// Only one host-trap session may be registered per agent at a time.
+hsa_status_t GpuAgent::PcSamplingCreateFromId(HsaPcSamplingTraceId ioctlId,
+                                              pcs::PcsRuntime::PcSamplingSession& session) {
+  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
+  const bool is_hosttrap = (session.method() == HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1);
+
+  if (is_hosttrap && ht_data.session) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+  session.SetThunkId(ioctlId);
+  // Only host-trap sessions occupy the host-trap slot; recording other methods
+  // here would overwrite the registration of an active host-trap session.
+  if (is_hosttrap) ht_data.session = &session;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// @brief Destroy the kernel driver trace of @p session and unregister it.
+hsa_status_t GpuAgent::PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session) {
+  pcs_hosttrap_t& ht_data = pcs_hosttrap_data_;
+  HSAKMT_STATUS retKmt = hsaKmtPcSamplingDestroy(node_id(), session.ThunkId());
+  // Release the host-trap slot only when this session is the one holding it.
+  if (ht_data.session == &session) ht_data.session = NULL;
+
+  return (retKmt == HSAKMT_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+}
 
 }  // namespace amd
 }  // namespace rocr
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def
index 953df6e4cc..24fa990b28 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def
+++ b/projects/rocr-runtime/runtime/hsa-runtime/hsacore.so.def
@@ -253,6 +253,9 @@ global:
 	hsa_tools_scratch_event_async_reclaim_start;
 	hsa_tools_scratch_event_async_reclaim_end;
 	hsa_ven_amd_pcs_iterate_configuration;
+	hsa_ven_amd_pcs_create;
+	hsa_ven_amd_pcs_create_from_id;
+	hsa_ven_amd_pcs_destroy;
 
 local:
 	*;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp b/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp
index 33bd0f6c50..3cc2fb86e5 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/hsa_ven_amd_pc_sampling.cpp
@@ -94,10 +94,55 @@ hsa_status_t hsa_ven_amd_pcs_iterate_configuration(
 
   CATCH;
 }
+
+// Validates the agent handle, then creates a new PC sampling session on it.
+hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t hsa_agent, hsa_ven_amd_pcs_method_kind_t method,
+                                    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
+                                    size_t buffer_size,
+                                    hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
+                                    void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
+  TRY;
+  IS_OPEN();
+  core::Agent* agent = core::Agent::Convert(hsa_agent);
+  if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice)
+    return HSA_STATUS_ERROR_INVALID_AGENT;
+
+  return PcsRuntime::instance()->PcSamplingCreate(
+      agent, method, units, interval, latency, buffer_size, data_ready_cb, client_cb_data, handle);
+  CATCH;
+}
+
+// Same as hsa_ven_amd_pcs_create, but binds the session to the existing trace
+// id pcs_id instead of asking the driver for a new one.
+hsa_status_t hsa_ven_amd_pcs_create_from_id(uint32_t pcs_id, hsa_agent_t hsa_agent,
+                                            hsa_ven_amd_pcs_method_kind_t method,
+                                            hsa_ven_amd_pcs_units_t units, size_t interval,
+                                            size_t latency, size_t buffer_size,
+                                            hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
+                                            void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
+  TRY;
+  IS_OPEN();
+  core::Agent* agent = core::Agent::Convert(hsa_agent);
+  if (agent == NULL || !agent->IsValid() || agent->device_type() != core::Agent::kAmdGpuDevice)
+    return HSA_STATUS_ERROR_INVALID_AGENT;
+
+  return PcsRuntime::instance()->PcSamplingCreateFromId(pcs_id, agent, method, units, interval,
+                                                        latency, buffer_size, data_ready_cb,
+                                                        client_cb_data, handle);
+  CATCH;
+}
+
+// Destroys the PC sampling session identified by handle.
+hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t handle) {
+  TRY;
+  return PcsRuntime::instance()->PcSamplingDestroy(handle);
+  CATCH;
+}
 
 void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api) {
   pcs_api->hsa_ven_amd_pcs_iterate_configuration_fn = hsa_ven_amd_pcs_iterate_configuration;
   pcs_api->hsa_ven_amd_pcs_create_fn = hsa_ven_amd_pcs_create;
+  pcs_api->hsa_ven_amd_pcs_create_from_id_fn = hsa_ven_amd_pcs_create_from_id;
   pcs_api->hsa_ven_amd_pcs_destroy_fn = hsa_ven_amd_pcs_destroy;
   pcs_api->hsa_ven_amd_pcs_start_fn = hsa_ven_amd_pcs_start;
   pcs_api->hsa_ven_amd_pcs_stop_fn = hsa_ven_amd_pcs_stop;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h b/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h
index 23d1acddf7..11cffde799 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/inc/hsa_ven_amd_pc_sampling_impl.h
@@ -59,6 +59,23 @@ hsa_status_t hsa_ven_amd_pcs_iterate_configuration(
     hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
     void* callback_data);
 
+// Create a PC sampling session on a GPU agent.
+hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+                                    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
+                                    size_t buffer_size,
+                                    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                                    void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling);
+
+// Create a PC sampling session bound to the existing trace id pcs_id.
+hsa_status_t hsa_ven_amd_pcs_create_from_id(
+    uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method,
+    hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size,
+    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data,
+    hsa_ven_amd_pcs_t* pc_sampling);
+
+// Destroy a PC sampling session.
+hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling);
+
 // Update Api table with func pointers that implement functionality
 void LoadPcSampling(core::PcSamplingExtTableInternal* pcs_api);
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp
index 0d2c49aeb7..9fa56ea8b7 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.cpp
@@ -52,6 +52,12 @@
 namespace rocr {
 namespace pcs {
 
+// Early-return HSA_STATUS_ERROR_INVALID_ARGUMENT from the enclosing function when ptr is NULL.
+#define IS_BAD_PTR(ptr)                                          \
+do {                                                             \
+  if ((ptr) == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;   \
+} while (false)
+
 std::atomic<PcsRuntime*> PcsRuntime::instance_(NULL);
 std::mutex PcsRuntime::instance_mutex_;
 
@@ -94,6 +100,72 @@ void PcsRuntime::DestroySingleton() {
 }
 
 void ReleasePcSamplingRsrcs() { PcsRuntime::DestroySingleton(); }
+
+// Build a session description from the client parameters. The session is
+// flagged invalid (see isValid()) when the method is unknown, the interval or
+// buffer size is zero, or the buffer size is not a multiple of two samples.
+PcsRuntime::PcSamplingSession::PcSamplingSession(
+    core::Agent* _agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units,
+    size_t interval, size_t latency, size_t buffer_size,
+    hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data)
+    : agent(_agent), thunkId_(0), valid_(true), sample_size_(0), csd() {
+  switch (method) {
+    case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1:
+      sample_size_ = sizeof(perf_sample_hosttrap_v1_t);
+      break;
+    case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1:
+      sample_size_ = sizeof(perf_sample_snapshot_v1_t);
+      break;
+    default:
+      valid_ = false;
+      return;
+  }
+
+  // NOTE(review): the factor of two presumably reflects a buffer split in two halves - confirm.
+  if (!interval || !buffer_size || (buffer_size % (2 * sample_size_))) {
+    valid_ = false;
+    return;
+  }
+
+  csd.method = method;
+  csd.units = units;
+  csd.interval = interval;
+  csd.latency = latency;
+  csd.buffer_size = buffer_size;
+  csd.data_ready_callback = data_ready_callback;
+  csd.client_callback_data = client_callback_data;
+}
+
+// Translate the client configuration into the HsaPcSamplingInfo layout used by
+// the hsaKmt interface. Unknown method/units values leave the field untouched.
+void PcsRuntime::PcSamplingSession::GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo) {
+  sampleInfo->value_min = 0;
+  sampleInfo->value_max = 0;
+  sampleInfo->flags = 0;
+  sampleInfo->value = csd.interval;
+
+  switch (csd.method) {
+    case HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1:
+      sampleInfo->method = HSA_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1;
+      break;
+    case HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1:
+      sampleInfo->method = HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1;
+      break;
+  }
+
+  switch (csd.units) {
+    case HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS:
+      sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS;
+      break;
+    case HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES:
+      sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_CYCLES;
+      break;
+    case HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS:
+      sampleInfo->units = HSA_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS;
+      break;
+  }
+}
+
 hsa_status_t PcsRuntime::PcSamplingIterateConfig(
     core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
     void* callback_data) {
@@ -101,6 +173,87 @@ hsa_status_t PcsRuntime::PcSamplingIterateConfig(
   return gpu_agent->PcSamplingIterateConfig(configuration_callback, callback_data);
 }
 
+// Create a session whose trace id is allocated by the kernel driver.
+hsa_status_t PcsRuntime::PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
+                                          hsa_ven_amd_pcs_units_t units, size_t interval,
+                                          size_t latency, size_t buffer_size,
+                                          hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
+                                          void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
+  IS_BAD_PTR(handle);
+  IS_BAD_PTR(data_ready_cb);
+
+  return PcSamplingCreateInternal(
+      agent, method, units, interval, latency, buffer_size, data_ready_cb, client_cb_data, handle,
+      [](core::Agent* agent_, PcSamplingSession& session_) {
+        return static_cast<AMD::GpuAgentInt*>(agent_)->PcSamplingCreate(session_);
+      });
+}
+
+// Create a session bound to the already existing trace id ioctl_pcs_id.
+hsa_status_t PcsRuntime::PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent,
+                                                hsa_ven_amd_pcs_method_kind_t method,
+                                                hsa_ven_amd_pcs_units_t units, size_t interval,
+                                                size_t latency, size_t buffer_size,
+                                                hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
+                                                void* client_cb_data, hsa_ven_amd_pcs_t* handle) {
+  IS_BAD_PTR(handle);
+  IS_BAD_PTR(data_ready_cb);
+
+  return PcSamplingCreateInternal(
+      agent, method, units, interval, latency, buffer_size, data_ready_cb, client_cb_data, handle,
+      [&](core::Agent* agent_, PcSamplingSession& session_) {
+        return static_cast<AMD::GpuAgentInt*>(agent_)->PcSamplingCreateFromId(ioctl_pcs_id,
+                                                                              session_);
+      });
+}
+
+// Shared creation path: allocates a handle, constructs the session inside
+// pc_sampling_, validates it and hands it to agent_pcs_create_fn. The session
+// is removed again when validation or the agent call fails.
+hsa_status_t PcsRuntime::PcSamplingCreateInternal(
+    core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method, hsa_ven_amd_pcs_units_t units,
+    size_t interval, size_t latency, size_t buffer_size,
+    hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb, void* client_cb_data,
+    hsa_ven_amd_pcs_t* handle, agent_pcs_create_fn_t agent_pcs_create_fn) {
+  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+
+  handle->handle = ++pc_sampling_id_;
+  // Construct the session in place; keep the iterator so that the entry is not
+  // looked up (or default-constructed through operator[]) again below.
+  auto inserted = pc_sampling_.emplace(
+      std::piecewise_construct, std::forward_as_tuple(handle->handle),
+      std::forward_as_tuple(agent, method, units, interval, latency, buffer_size, data_ready_cb,
+                            client_cb_data));
+  auto session_it = inserted.first;
+
+  if (!session_it->second.isValid()) {
+    pc_sampling_.erase(session_it);
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  hsa_status_t ret = agent_pcs_create_fn(agent, session_it->second);
+  if (ret != HSA_STATUS_SUCCESS) {
+    pc_sampling_.erase(session_it);
+    return ret;
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+// Destroy the session identified by handle and drop it from pc_sampling_.
+hsa_status_t PcsRuntime::PcSamplingDestroy(hsa_ven_amd_pcs_t handle) {
+  ScopedAcquire<KernelMutex> lock(&pc_sampling_lock_);
+  auto pcSamplingSessionIt = pc_sampling_.find(handle.handle);
+  if (pcSamplingSessionIt == pc_sampling_.end()) {
+    debug_warning(false && "Cannot find PcSampling session");
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  AMD::GpuAgentInt* gpu_agent = static_cast<AMD::GpuAgentInt*>(pcSamplingSessionIt->second.agent);
+
+  hsa_status_t ret = gpu_agent->PcSamplingDestroy(pcSamplingSessionIt->second);
+  pc_sampling_.erase(pcSamplingSessionIt);
+  return ret;
+}
+
 }  // namespace pcs
 }  // namespace rocr
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h
index 59af11b4a6..606d05e950 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/pcs/pcs_runtime.h
@@ -59,7 +59,7 @@ namespace pcs {
 
 class PcsRuntime {
  public:
-  PcsRuntime() {}
+  PcsRuntime() : pc_sampling_id_(0) {}
   ~PcsRuntime() {}
 
   /// @brief Getter for the PcsRuntime singleton object.
@@ -68,10 +68,66 @@ class PcsRuntime {
   /// @brief Destroy singleton object.
   static void DestroySingleton();
 
+  /// @brief Client configuration and driver trace id of one PC sampling
+  /// session. isValid() reports whether the configuration was accepted.
+  class PcSamplingSession {
+   public:
+    PcSamplingSession() : agent(NULL), thunkId_(0), valid_(false), sample_size_(0), csd() {}
+    PcSamplingSession(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
+                      hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
+                      size_t buffer_size, hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback,
+                      void* client_callback_data);
+    ~PcSamplingSession() {}
+
+    bool isValid() const { return valid_; }
+    size_t buffer_size() const { return csd.buffer_size; }
+    hsa_ven_amd_pcs_method_kind_t method() const { return csd.method; }
+    size_t latency() const { return csd.latency; }
+    size_t sample_size() const { return sample_size_; }
+
+    void GetHsaKmtSamplingInfo(HsaPcSamplingInfo* sampleInfo);
+
+    core::Agent* agent;
+    void SetThunkId(HsaPcSamplingTraceId thunkId) { thunkId_ = thunkId; }
+    HsaPcSamplingTraceId ThunkId() const { return thunkId_; }
+
+   private:
+    HsaPcSamplingTraceId thunkId_;
+
+    bool valid_;  // Whether configuration parameters are valid
+    size_t sample_size_;
+
+    struct client_session_data_t {
+      hsa_ven_amd_pcs_method_kind_t method;
+      hsa_ven_amd_pcs_units_t units;
+      size_t interval;
+      size_t latency;
+      size_t buffer_size;
+      hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback;
+      void* client_callback_data;
+    };
+    struct client_session_data_t csd;
+  };  // class PcSamplingSession
+
   hsa_status_t PcSamplingIterateConfig(
       core::Agent* agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback,
       void* callback_data);
 
+  hsa_status_t PcSamplingCreate(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
+                                hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency,
+                                size_t buffer_size,
+                                hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
+                                void* client_cb_data, hsa_ven_amd_pcs_t* handle);
+
+  hsa_status_t PcSamplingCreateFromId(uint32_t ioctl_pcs_id, core::Agent* agent,
+                                      hsa_ven_amd_pcs_method_kind_t method,
+                                      hsa_ven_amd_pcs_units_t units, size_t interval,
+                                      size_t latency, size_t buffer_size,
+                                      hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
+                                      void* client_cb_data, hsa_ven_amd_pcs_t* handle);
+
+  hsa_status_t PcSamplingDestroy(hsa_ven_amd_pcs_t handle);
+
  private:
   /// @brief Initialize singleton object, must be called once.
   static PcsRuntime* CreateSingleton();
@@ -80,7 +136,22 @@ class PcsRuntime {
   static std::atomic<PcsRuntime*> instance_;
   static std::mutex instance_mutex_;
 
+  // Map of pc sampling sessions indexed by hsa_ven_amd_pcs_t handle
+  std::map<uint64_t, PcSamplingSession> pc_sampling_;
+  KernelMutex pc_sampling_lock_;
+  uint64_t pc_sampling_id_;
+
   DISALLOW_COPY_AND_ASSIGN(PcsRuntime);
+
+  // Agent-level hook invoked by PcSamplingCreateInternal once a session has
+  // been constructed and validated.
+  using agent_pcs_create_fn_t = std::function<hsa_status_t(core::Agent*, PcSamplingSession&)>;
+  hsa_status_t PcSamplingCreateInternal(core::Agent* agent, hsa_ven_amd_pcs_method_kind_t method,
+                                        hsa_ven_amd_pcs_units_t units, size_t interval,
+                                        size_t latency, size_t buffer_size,
+                                        hsa_ven_amd_pcs_data_ready_callback_t data_ready_cb,
+                                        void* client_cb_data, hsa_ven_amd_pcs_t* handle,
+                                        agent_pcs_create_fn_t agent_pcs_create_fn);
 };
 
 }  // namespace pcs