From b278cd379baadf8836758112d86f7760262e8e7f Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Wed, 11 Nov 2020 14:41:27 -0600 Subject: [PATCH] Add event notification support and rdci timestamps Also: * print header line every 50 line on output * print events that are being listened for with header * cpplint clean-up Change-Id: Ic049eb79156a9528b556e56f0fa43e1344f898cc --- .../{rdc_field_data.data => rdc_field.data} | 5 + common/rdc_fields_supported.cc | 4 +- include/rdc/rdc.h | 29 ++- include/rdc_lib/RdcModuleMgr.h | 6 +- include/rdc_lib/RdcNotification.h | 62 ++++++ include/rdc_lib/RdcTelemetry.h | 6 +- include/rdc_lib/RdcTelemetryLibInterface.h | 6 +- include/rdc_lib/RdcWatchTable.h | 1 + include/rdc_lib/impl/RdcCacheManagerImpl.h | 6 +- include/rdc_lib/impl/RdcEmbeddedHandler.h | 3 +- include/rdc_lib/impl/RdcMetricsUpdaterImpl.h | 1 + include/rdc_lib/impl/RdcModuleMgrImpl.h | 7 +- include/rdc_lib/impl/RdcNotificationImpl.h | 61 ++++++ include/rdc_lib/impl/RdcRasLib.h | 6 +- include/rdc_lib/impl/RdcSmiLib.h | 6 +- include/rdc_lib/impl/RdcTelemetryModule.h | 6 +- include/rdc_lib/impl/RdcWatchTableImpl.h | 10 +- include/rdc_lib/impl/RsmiUtils.h | 37 ++++ python_binding/rdc_bootstrap.py | 4 + rdc_libs/CMakeLists.txt | 6 + rdc_libs/rdc/src/RdcCacheManagerImpl.cc | 21 +- rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 7 +- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 43 +---- rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc | 9 + rdc_libs/rdc/src/RdcNotificationImpl.cc | 182 ++++++++++++++++++ rdc_libs/rdc/src/RdcWatchTableImpl.cc | 74 ++++++- rdc_libs/rdc/src/RsmiUtils.cc | 73 +++++++ rdci/include/RdciDmonSubSystem.h | 3 +- rdci/src/RdciDmonSubSystem.cc | 182 ++++++++++++++++-- server/src/rdc_api_service.cc | 5 +- server/src/rdc_server_main.cc | 1 - tests/rdc_tests/test_base.cc | 3 +- 32 files changed, 766 insertions(+), 109 deletions(-) rename common/{rdc_field_data.data => rdc_field.data} (90%) create mode 100644 include/rdc_lib/RdcNotification.h create mode 100644 
include/rdc_lib/impl/RdcNotificationImpl.h create mode 100644 include/rdc_lib/impl/RsmiUtils.h create mode 100644 rdc_libs/rdc/src/RdcNotificationImpl.cc create mode 100644 rdc_libs/rdc/src/RsmiUtils.cc diff --git a/common/rdc_field_data.data b/common/rdc_field.data similarity index 90% rename from common/rdc_field_data.data rename to common/rdc_field.data index 6c389fea6d..b2850eec2f 100644 --- a/common/rdc_field_data.data +++ b/common/rdc_field.data @@ -57,3 +57,8 @@ FLD_DESC_ENT(RDC_EVNT_XGMI_1_BEATS_TX, "Data sent to neighbor 1 (32 byte pkts) FLD_DESC_ENT(RDC_EVNT_XGMI_0_THRPUT, "Tx throughput to XGMI neighbor 0 in b/s", "XGMI_0_T", true) FLD_DESC_ENT(RDC_EVNT_XGMI_1_THRPUT, "Tx throughput to XGMI neighbor 1 in b/s", "XGMI_1_T", true) +// Asynchronous event notifications +FLD_DESC_ENT(RDC_EVNT_NOTIF_VMFAULT, "VM page fault", "VM_PAGE_FAULT", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp", "THERMAL_THROT", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false) +FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false) diff --git a/common/rdc_fields_supported.cc b/common/rdc_fields_supported.cc index 7d1ca7226d..4a2dc98de1 100644 --- a/common/rdc_fields_supported.cc +++ b/common/rdc_fields_supported.cc @@ -31,13 +31,13 @@ namespace rdc { #define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) \ {static_cast(ID), {#ID, (DESC), (LABEL), (DISPLAY)}}, static const fld_id2name_map_t field_id_to_descript = { - #include "common/rdc_field_data.data" + #include "common/rdc_field.data" }; #undef FLD_DESC_ENT #define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) {#ID, (ID)}, static fld_name2id_map_t field_name_to_id = { - #include "common/rdc_field_data.data" // NOLINT + #include "common/rdc_field.data" // NOLINT }; #undef FLD_DESC_ENT diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index db70734789..254d1d6520 100755 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ 
-219,8 +219,19 @@ typedef enum { //!< neighbor 0 in byes/sec RDC_EVNT_XGMI_1_THRPUT, //!< Transmit throughput to XGMI //!< neighbor 1 in byes/sec -} rdc_field_t; + RDC_EVNT_NOTIF_VMFAULT = 2000, //!< VM page fault + RDC_EVNT_NOTIF_FIRST = RDC_EVNT_NOTIF_VMFAULT, + + RDC_EVNT_NOTIF_THERMAL_THROTTLE, //!< Clock frequency has decreased + //!< due to temperature rise + RDC_EVNT_NOTIF_PRE_RESET, //!< GPU reset is about to occur + RDC_EVNT_NOTIF_POST_RESET, //!< GPU reset just occurred + + RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_POST_RESET, +} rdc_field_t; +#define RDC_EVNT_IS_NOTIF_FIELD(FIELD) \ + ((FIELD) >= RDC_EVNT_NOTIF_FIRST && (FIELD) <= RDC_EVNT_NOTIF_LAST) /** * @brief handlers used in various rdc calls */ @@ -290,6 +301,15 @@ typedef struct { rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary staticstics by GPU } rdc_job_info_t; +/** + * @brief Field value data + */ +typedef union { + int64_t l_int; + double dbl; + char str[RDC_MAX_STR_LENGTH]; +} rdc_field_value_data; + /** * @brief The structure to store the field value */ @@ -298,11 +318,7 @@ typedef struct { int status; //!< RDC_ST_OK or error status uint64_t ts; //!< Timestamp in usec since 1970 rdc_field_type_t type; //!< The field type - union { - int64_t l_int; - double dbl; - char str[RDC_MAX_STR_LENGTH]; - } value; //!< Value of the field. Value type + rdc_field_value_data value; //!< Value of the field. Value type //!< depends on the field type. } rdc_field_value; @@ -328,7 +344,6 @@ typedef struct { uint64_t stop_time; //!< job stop time } rdc_job_group_info_t; - /** * @brief Initialize ROCm RDC. * diff --git a/include/rdc_lib/RdcModuleMgr.h b/include/rdc_lib/RdcModuleMgr.h index bf5b23aa4f..ac71057fde 100644 --- a/include/rdc_lib/RdcModuleMgr.h +++ b/include/rdc_lib/RdcModuleMgr.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef RDC_LIB_RDCMODULEMGR_H_ -#define RDC_LIB_RDCMODULEMGR_H_ +#ifndef INCLUDE_RDC_LIB_RDCMODULEMGR_H_ +#define INCLUDE_RDC_LIB_RDCMODULEMGR_H_ #include #include "rdc_lib/rdc_common.h" @@ -41,4 +41,4 @@ typedef std::shared_ptr RdcModuleMgrPtr; } // namespace amd -#endif // RDC_LIB_RDCMODULEMGR_H_ +#endif // INCLUDE_RDC_LIB_RDCMODULEMGR_H_ diff --git a/include/rdc_lib/RdcNotification.h b/include/rdc_lib/RdcNotification.h new file mode 100644 index 0000000000..90f5dc912c --- /dev/null +++ b/include/rdc_lib/RdcNotification.h @@ -0,0 +1,62 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef INCLUDE_RDC_LIB_RDCNOTIFICATION_H_ +#define INCLUDE_RDC_LIB_RDCNOTIFICATION_H_ + +#include +#include +#include "rdc_lib/rdc_common.h" +#include "rdc/rdc.h" + +namespace amd { +namespace rdc { + +extern const uint32_t kMaxRSMIEvents; + +typedef struct { + uint32_t gpu_id; + rdc_field_value field; +} rdc_evnt_notification_t; + +class RdcNotification { + public: + virtual bool is_notification_event(rdc_field_t field) const = 0; + + virtual rdc_status_t + set_listen_events(const std::vector fk_arr) = 0; + + // Blocking + virtual rdc_status_t + listen(rdc_evnt_notification_t *events, uint32_t *num_events, + uint32_t timeout_ms) = 0; + + virtual rdc_status_t stop_listening(uint32_t gpu_id) = 0; + virtual ~RdcNotification() {} +}; + +typedef std::shared_ptr RdcNotificationPtr; + +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_RDCNOTIFICATION_H_ + diff --git a/include/rdc_lib/RdcTelemetry.h b/include/rdc_lib/RdcTelemetry.h index 1c6bf7e759..16717beb36 100644 --- a/include/rdc_lib/RdcTelemetry.h +++ b/include/rdc_lib/RdcTelemetry.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef RDC_LIB_RDCTELEMETRY_H_ -#define RDC_LIB_RDCTELEMETRY_H_ +#ifndef INCLUDE_RDC_LIB_RDCTELEMETRY_H_ +#define INCLUDE_RDC_LIB_RDCTELEMETRY_H_ #include #include "rdc/rdc.h" @@ -54,4 +54,4 @@ typedef std::shared_ptr RdcTelemetryPtr; } // namespace amd -#endif // RDC_LIB_RDCTELEMETRY_H_ +#endif // INCLUDE_RDC_LIB_RDCTELEMETRY_H_ diff --git a/include/rdc_lib/RdcTelemetryLibInterface.h b/include/rdc_lib/RdcTelemetryLibInterface.h index 97194ed6ed..939a8628fa 100644 --- a/include/rdc_lib/RdcTelemetryLibInterface.h +++ b/include/rdc_lib/RdcTelemetryLibInterface.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ -#define RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ +#ifndef INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ +#define INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ // The telemetry interface for libraries, for example, RAS. #include @@ -65,4 +65,4 @@ rdc_status_t rdc_module_destroy(); } -#endif // RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ +#endif // INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ diff --git a/include/rdc_lib/RdcWatchTable.h b/include/rdc_lib/RdcWatchTable.h index ecb6f0823f..44abfcf374 100644 --- a/include/rdc_lib/RdcWatchTable.h +++ b/include/rdc_lib/RdcWatchTable.h @@ -34,6 +34,7 @@ namespace rdc { class RdcWatchTable { public: virtual rdc_status_t rdc_field_update_all() = 0; + virtual rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) = 0; virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64], uint64_t update_freq, diff --git a/include/rdc_lib/impl/RdcCacheManagerImpl.h b/include/rdc_lib/impl/RdcCacheManagerImpl.h index 9ef38b9502..a83d78ef5b 100644 --- a/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -35,9 +35,13 @@ THE SOFTWARE. 
namespace amd { namespace rdc { +// Note, the .cc code relies on RdcCacheEntry only having plain-old-data +// types and arrays (no pointers). If a pointer is added, make sure to update +// any code that copies this structure. struct RdcCacheEntry { uint64_t last_time; - int64_t value; + rdc_field_type_t type; + rdc_field_value_data value; }; typedef std::map> RdcCacheSamples; diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index 2ba6aaf808..8fa54e4723 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -30,6 +30,7 @@ THE SOFTWARE. #include "rdc_lib/RdcMetricsUpdater.h" #include "rdc_lib/RdcWatchTable.h" #include "rdc_lib/RdcModuleMgr.h" +#include "rdc_lib/RdcNotification.h" namespace amd { namespace rdc { @@ -85,7 +86,6 @@ class RdcEmbeddedHandler: public RdcHandler { uint64_t *next_since_time_stamp, rdc_field_value* value) override; rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; - // Control API rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; @@ -98,6 +98,7 @@ class RdcEmbeddedHandler: public RdcHandler { RdcCacheManagerPtr cache_mgr_; RdcMetricFetcherPtr metric_fetcher_; RdcModuleMgrPtr rdc_module_mgr_; + RdcNotificationPtr rdc_notif_; RdcWatchTablePtr watch_table_; RdcMetricsUpdaterPtr metrics_updater_; std::future updater_; diff --git a/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h b/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h index 72fd9d1661..c37e1f6d5d 100644 --- a/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h +++ b/include/rdc_lib/impl/RdcMetricsUpdaterImpl.h @@ -40,6 +40,7 @@ class RdcMetricsUpdaterImpl: public RdcMetricsUpdater { RdcWatchTablePtr watch_table_; std::atomic started_; std::future updater_; // keep the future of updater + std::future notif_updater_; // keep the future of notif updater const uint32_t _check_frequency; // Check frequency in milliseconds }; diff --git 
a/include/rdc_lib/impl/RdcModuleMgrImpl.h b/include/rdc_lib/impl/RdcModuleMgrImpl.h index 6e0706fdd8..e1467e5e0d 100644 --- a/include/rdc_lib/impl/RdcModuleMgrImpl.h +++ b/include/rdc_lib/impl/RdcModuleMgrImpl.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_ -#define RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_ #include #include "rdc_lib/RdcModuleMgr.h" @@ -43,11 +43,10 @@ class RdcModuleMgrImpl: public RdcModuleMgr { RdcRasLibPtr ras_lib_; RdcMetricFetcherPtr fetcher_; - }; } // namespace rdc } // namespace amd -#endif // RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_ +#endif // INCLUDE_RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_ diff --git a/include/rdc_lib/impl/RdcNotificationImpl.h b/include/rdc_lib/impl/RdcNotificationImpl.h new file mode 100644 index 0000000000..abd2f0bf47 --- /dev/null +++ b/include/rdc_lib/impl/RdcNotificationImpl.h @@ -0,0 +1,61 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_ + +#include +#include +#include +#include + +#include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcNotification.h" +#include "rdc/rdc.h" + + +namespace amd { +namespace rdc { + +class RdcNotificationImpl : public RdcNotification { + public: + RdcNotificationImpl(); + ~RdcNotificationImpl(); + + bool is_notification_event(rdc_field_t field) const override; + rdc_status_t set_listen_events( + const std::vector fk_arr) override; + // Blocking + rdc_status_t listen(rdc_evnt_notification_t *events, + uint32_t *num_events, uint32_t timeout_ms) override; + rdc_status_t stop_listening(uint32_t gpu_id) override; + + private: + std::map gpu_evnt_notif_masks_; + std::mutex notif_mutex_; +}; + + +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_ + diff --git a/include/rdc_lib/impl/RdcRasLib.h b/include/rdc_lib/impl/RdcRasLib.h index 1308444302..ae8255e8e7 100644 --- a/include/rdc_lib/impl/RdcRasLib.h +++ b/include/rdc_lib/impl/RdcRasLib.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef RDC_LIB_IMPL_RDCRASLIB_H_ -#define RDC_LIB_IMPL_RDCRASLIB_H_ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_ #include #include @@ -74,4 +74,4 @@ typedef std::shared_ptr RdcRasLibPtr; } // namespace amd -#endif // RDC_LIB_IMPL_RDCRASLIB_H_ +#endif // INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_ diff --git a/include/rdc_lib/impl/RdcSmiLib.h b/include/rdc_lib/impl/RdcSmiLib.h index 88d7e4444c..5f311b6a2b 100644 --- a/include/rdc_lib/impl/RdcSmiLib.h +++ b/include/rdc_lib/impl/RdcSmiLib.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef RDC_LIB_IMPL_RDCSMILIB_H_ -#define RDC_LIB_IMPL_RDCSMILIB_H_ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCSMILIB_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCSMILIB_H_ #include #include @@ -55,4 +55,4 @@ class RdcSmiLib : public RdcTelemetry { } // namespace rdc } // namespace amd -#endif // RDC_LIB_IMPL_RDCSMILIB_H_ +#endif // INCLUDE_RDC_LIB_IMPL_RDCSMILIB_H_ diff --git a/include/rdc_lib/impl/RdcTelemetryModule.h b/include/rdc_lib/impl/RdcTelemetryModule.h index c0688cd5fc..62c588a374 100644 --- a/include/rdc_lib/impl/RdcTelemetryModule.h +++ b/include/rdc_lib/impl/RdcTelemetryModule.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ -#define RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ #include #include @@ -69,4 +69,4 @@ typedef std::shared_ptr RdcTelemetryModulePtr; } // namespace amd -#endif // RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ +#endif // INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_ diff --git a/include/rdc_lib/impl/RdcWatchTableImpl.h b/include/rdc_lib/impl/RdcWatchTableImpl.h index 6d95a0d7d3..297650195d 100644 --- a/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -34,6 +34,7 @@ THE SOFTWARE. #include "rdc_lib/RdcCacheManager.h" #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcModuleMgr.h" +#include "rdc_lib/RdcNotification.h" #include "rocm_smi/rocm_smi.h" namespace amd { @@ -82,10 +83,12 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< is expensive. Internally, this function will throttle the cleanup to //!< once per second. 
rdc_status_t rdc_field_update_all() override; + rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override; RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, - const RdcModuleMgrPtr& module_mgr); + const RdcModuleMgrPtr& module_mgr, + const RdcNotificationPtr& notif); private: //!< Helper function to Update the fields_in_table when unwatch tables @@ -106,8 +109,8 @@ class RdcWatchTableImpl : public RdcWatchTable { bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id, std::string& job_id) const; // NOLINT - rdc_status_t initialize_rsmi_handles(RdcFieldKey fk); - + rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t *events, + uint32_t num_events); //!< The function will be pass as the callback for bulk fetch static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values, void* user_data); @@ -115,6 +118,7 @@ class RdcWatchTableImpl : public RdcWatchTable { RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; RdcModuleMgrPtr rdc_module_mgr_; + RdcNotificationPtr notifications_; //!< The watch table to store the watch settings. std::map watch_table_; diff --git a/include/rdc_lib/impl/RsmiUtils.h b/include/rdc_lib/impl/RsmiUtils.h new file mode 100644 index 0000000000..a49bf63586 --- /dev/null +++ b/include/rdc_lib/impl/RsmiUtils.h @@ -0,0 +1,37 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ +#define INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ + +#include "rocm_smi/rocm_smi.h" + +namespace amd { +namespace rdc { + +rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi); + +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ + diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py index 6a8da28229..c0c9caed21 100644 --- a/python_binding/rdc_bootstrap.py +++ b/python_binding/rdc_bootstrap.py @@ -77,6 +77,10 @@ class rdc_field_t(c_int): RDC_EVNT_XGMI_1_BEATS_TX = 1007 RDC_EVNT_XGMI_0_THRPUT = 1500 RDC_EVNT_XGMI_1_THRPUT = 1501 + RDC_EVNT_NOTIF_VMFAULT = 2000 + RDC_EVNT_NOTIF_THERMAL_THROTTLE = 2001 + RDC_EVNT_NOTIF_PRE_RESET = 2002 + RDC_EVNT_NOTIF_POST_RESET = 2003 rdc_handle_t = c_void_p rdc_gpu_group_t = c_uint32 diff --git a/rdc_libs/CMakeLists.txt b/rdc_libs/CMakeLists.txt index f4f7f82d80..860a88c881 100755 --- a/rdc_libs/CMakeLists.txt +++ b/rdc_libs/CMakeLists.txt @@ -125,6 +125,7 @@ message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}") add_library(${BOOTSTRAP_LIB} SHARED ${BOOTSTRAP_LIB_SRC_LIST} ${BOOTSTRAP_LIB_INC_LIST}) target_link_libraries(${BOOTSTRAP_LIB} pthread dl) target_include_directories(${BOOTSTRAP_LIB} PRIVATE + "${RSMI_INC_DIR}" "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}" @@ -150,6 +151,8 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcRasLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcTelemetryModule.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_fields_supported.cc") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} 
"${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h") @@ -169,6 +172,9 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcMod set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcModuleMgr.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcTelemetry.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcTelemetryModule.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotification.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_fields_supported.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") diff --git a/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index 0bfb0ca1bc..7b3ea60b10 100644 --- a/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -59,8 +59,14 @@ rdc_status_t RdcCacheManagerImpl::rdc_field_get_value_since( *next_since_time_stamp = cache_value->last_time + 1; } value->ts = cache_value->last_time; - value->type = INTEGER; - value->value.l_int = cache_value->value; + value->type = cache_value->type; + + if (value->type == STRING) { + strncpy_with_null(value->value.str, cache_value->value.str, + RDC_MAX_STR_LENGTH); + } else { + value->value.l_int = cache_value->value.l_int; + } value->field_id = field_id; return RDC_ST_OK; } @@ -123,8 +129,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_field_get_latest_value( auto& cache_value = cache_samples_ite->second.back(); value->ts = cache_value.last_time; - value->type = INTEGER; - value->value.l_int = cache_value.value; + value->type = cache_value.type; + value->value = cache_value.value; value->field_id = field_id; return RDC_ST_OK; @@ -156,11 +162,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t 
gpu_index, const rdc_field_value& value) { RdcCacheEntry entry; entry.last_time = value.ts; - if (value.type == INTEGER) { - entry.value = value.value.l_int; - } else { - return RDC_ST_NOT_SUPPORTED; - } + entry.value = value.value; + entry.type = value.type; std::lock_guard guard(cache_mutex_); RdcFieldKey field{gpu_index, value.field_id}; diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 202b2032aa..40b390e597 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -19,17 +19,19 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "rdc_lib/impl/RdcEmbeddedHandler.h" #include +#include "rdc_lib/impl/RdcEmbeddedHandler.h" #include "rdc_lib/impl/RdcMetricFetcherImpl.h" #include "rdc_lib/impl/RdcGroupSettingsImpl.h" #include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" #include "rdc_lib/impl/RdcCacheManagerImpl.h" #include "rdc_lib/impl/RdcWatchTableImpl.h" #include "rdc_lib/impl/RdcModuleMgrImpl.h" +#include "rdc_lib/impl/RdcNotificationImpl.h" #include "rdc_lib/rdc_common.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcException.h" +#include "rdc_lib/RdcNotification.h" #include "common/rdc_fields_supported.h" #include "rocm_smi/rocm_smi.h" @@ -72,8 +74,9 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode): , cache_mgr_(new RdcCacheManagerImpl()) , metric_fetcher_(new RdcMetricFetcherImpl()) , rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)) + , rdc_notif_(new RdcNotificationImpl()) , watch_table_(new RdcWatchTableImpl(group_settings_, - cache_mgr_, rdc_module_mgr_)) + cache_mgr_, rdc_module_mgr_, rdc_notif_)) , metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) { if (mode == RDC_OPERATION_MODE_AUTO) { diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc 
b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 192452232c..8a13217b04 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -31,6 +31,7 @@ THE SOFTWARE. #include "common/rdc_fields_supported.h" #include "rdc_lib/RdcLogger.h" #include "rocm_smi/rocm_smi.h" +#include "rdc_lib/impl/RsmiUtils.h" namespace amd { namespace rdc { @@ -551,47 +552,5 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { return RDC_ST_OK; } -rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi) { - switch (rsmi) { - case RSMI_STATUS_SUCCESS: - return RDC_ST_OK; - - case RSMI_STATUS_INVALID_ARGS: - return RDC_ST_BAD_PARAMETER; - - case RSMI_STATUS_NOT_SUPPORTED: - return RDC_ST_NOT_SUPPORTED; - - case RSMI_STATUS_NOT_FOUND: - return RDC_ST_NOT_FOUND; - - case RSMI_STATUS_OUT_OF_RESOURCES: - return RDC_ST_INSUFF_RESOURCES; - - case RSMI_STATUS_FILE_ERROR: - return RDC_ST_FILE_ERROR; - - case RSMI_STATUS_NO_DATA: - return RDC_ST_NO_DATA; - - case RSMI_STATUS_PERMISSION: - return RDC_ST_PERM_ERROR; - - case RSMI_STATUS_BUSY: - case RSMI_STATUS_UNKNOWN_ERROR: - case RSMI_STATUS_INTERNAL_EXCEPTION: - case RSMI_STATUS_INPUT_OUT_OF_BOUNDS: - case RSMI_STATUS_INIT_ERROR: - case RSMI_STATUS_NOT_YET_IMPLEMENTED: - case RSMI_STATUS_INSUFFICIENT_SIZE: - case RSMI_STATUS_INTERRUPT: - case RSMI_STATUS_UNEXPECTED_SIZE: - case RSMI_STATUS_UNEXPECTED_DATA: - case RSMI_STATUS_REFCOUNT_OVERFLOW: - default: - return RDC_ST_UNKNOWN_ERROR; - } -} - } // namespace rdc } // namespace amd diff --git a/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc b/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc index cf7dd07a28..2bc61ec590 100644 --- a/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc @@ -36,11 +36,20 @@ RdcMetricsUpdaterImpl::RdcMetricsUpdaterImpl( , _check_frequency(check_frequency) { } +// Make the listen time for notifications a relatively long time. +// There's no point in starting/stopping it constantly. 
+static const uint32_t kRdcFieldListenNotifTime_mS = 10000; + void RdcMetricsUpdaterImpl::start() { if (started_) { return; } started_ = true; + notif_updater_ = std::async(std::launch::async, [this](){ + while (started_) { + watch_table_->rdc_field_listen_notif(kRdcFieldListenNotifTime_mS); + } + }); updater_ = std::async(std::launch::async, [this](){ while (started_) { watch_table_->rdc_field_update_all(); diff --git a/rdc_libs/rdc/src/RdcNotificationImpl.cc b/rdc_libs/rdc/src/RdcNotificationImpl.cc new file mode 100644 index 0000000000..d3808ecc40 --- /dev/null +++ b/rdc_libs/rdc/src/RdcNotificationImpl.cc @@ -0,0 +1,182 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include +#include +#include + +#include +#include +#include + +#include "rdc/rdc.h" +#include "rdc_lib/impl/RdcTelemetryModule.h" +#include "rdc_lib/impl/RdcNotificationImpl.h" +#include "rdc_lib/impl/RsmiUtils.h" +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/impl/RdcSmiLib.h" +#include "rocm_smi/rocm_smi.h" + +namespace amd { +namespace rdc { + +static std::unordered_map + rdc_2_rsmi_event_notif_map = { + {RDC_EVNT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_VMFAULT}, + {RDC_EVNT_NOTIF_FIRST, RSMI_EVT_NOTIF_FIRST}, + {RDC_EVNT_NOTIF_THERMAL_THROTTLE, RSMI_EVT_NOTIF_THERMAL_THROTTLE}, + {RDC_EVNT_NOTIF_PRE_RESET, RSMI_EVT_NOTIF_GPU_PRE_RESET}, + {RDC_EVNT_NOTIF_POST_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET}, +}; +static std::unordered_map + rsmi_event_notif_2_rdc_map = { + {RSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT}, + {RSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST}, + {RSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE}, + {RSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET}, + {RSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET}, +}; + +// This const determines space allocated on stack for notification events. 
+const uint32_t kMaxRSMIEvents = 64; + +RdcNotificationImpl::RdcNotificationImpl() { +} + +RdcNotificationImpl::~RdcNotificationImpl() { +} + +bool +RdcNotificationImpl::is_notification_event(rdc_field_t field) const { + if (rdc_2_rsmi_event_notif_map.find(field) == + rdc_2_rsmi_event_notif_map.end()) { + return false; + } + return true; +} + +rdc_status_t +RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { + rsmi_status_t ret; + std::map new_masks; + + for (uint32_t i = 0; i < fk_arr.size(); ++i) { + if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) == + rdc_2_rsmi_event_notif_map.end()) { + continue; + } + new_masks[fk_arr[i].first] |= + RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]); + } + + std::map::iterator it = new_masks.begin(); + + std::lock_guard guard(notif_mutex_); + for (; it != new_masks.end(); ++it) { + if (it->second == gpu_evnt_notif_masks_[it->first]) { + // No change to mask; nothing to be done + continue; + } + ret = rsmi_event_notification_init(it->first); + if (ret != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, + "rsmi_event_notification_init() returned " << ret << " for device " << + it->first << ".
" << std::endl << + " Will not listen for events on this device"); + continue; + } + + ret = rsmi_event_notification_mask_set(it->first, it->second); + + if (ret == RSMI_STATUS_SUCCESS) { + gpu_evnt_notif_masks_[it->first] = it->second; + RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first << + " is set to 0x" << std::hex << it->second); + } else { + RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret + << " for device " << it->first); + return Rsmi2RdcError(ret); + } + } + return RDC_ST_OK; +} + +// Blocking; *num_events is capacity on entry, event count on return +rdc_status_t +RdcNotificationImpl::listen(rdc_evnt_notification_t *events, + uint32_t *num_events, uint32_t timeout_ms) { + if (events == nullptr || num_events == nullptr || *num_events == 0) { + return RDC_ST_BAD_PARAMETER; + } + + uint32_t f_cnt = std::min(*num_events, kMaxRSMIEvents); + rsmi_evt_notification_data_t rsmi_events[kMaxRSMIEvents]; + + rsmi_status_t ret = + rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events); + + if (ret != RSMI_STATUS_SUCCESS) { + return Rsmi2RdcError(ret); + } + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; + *num_events = f_cnt; + + for (uint32_t i = 0; i < f_cnt; ++i) { + assert(rsmi_event_notif_2_rdc_map.find(rsmi_events[i].event) != + rsmi_event_notif_2_rdc_map.end()); + events[i].gpu_id = rsmi_events[i].dv_ind; + events[i].field.field_id = rsmi_event_notif_2_rdc_map[rsmi_events[i].event]; + events[i].field.status = RDC_ST_OK; + events[i].field.ts = now; + events[i].field.type = STRING; + strncpy_with_null(events[i].field.value.str, + rsmi_events[i].message, RDC_MAX_STR_LENGTH); + } + + return RDC_ST_OK; +} + +rdc_status_t +RdcNotificationImpl::stop_listening(uint32_t gpu_id) { + rsmi_status_t ret; + + ret = rsmi_event_notification_mask_set(gpu_id, 0); + if (ret != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret + << " for device " << gpu_id); + } + + ret = rsmi_event_notification_stop(gpu_id); +
if (ret == RSMI_STATUS_SUCCESS) { + std::lock_guard guard(notif_mutex_); + gpu_evnt_notif_masks_[gpu_id] = 0; + } else { + RDC_LOG(RDC_INFO, "rsmi_event_notification_stop() returned " << ret + << " for device " << gpu_id); + } + return RDC_ST_OK; +} + + +} // namespace rdc +} // namespace amd diff --git a/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 692dd5e93b..93cbbecad9 100644 --- a/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -38,10 +38,12 @@ namespace rdc { RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, - const RdcModuleMgrPtr& module_mgr): + const RdcModuleMgrPtr& module_mgr, + const RdcNotificationPtr& notif): group_settings_(group_settings) , cache_mgr_(cache_mgr) , rdc_module_mgr_(module_mgr) + , notifications_(notif) , last_cleanup_time_(0) { } @@ -209,6 +211,13 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, return result; } + // See if any of the fields are notification fields, and + // set them up, if so. + result = notifications_->set_listen_events(fields_in_watch); + if (result != RDC_ST_OK) { + RDC_LOG(RDC_DEBUG, + "Error in configuring for event notification. 
Return " << result); + } // Skip not supported fields uint32_t unsupported_fields = 0; auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); @@ -230,15 +239,17 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, } } if (not_supported) { + if (!notifications_->is_notification_event(it->second)) { + unsupported_fields++; + } it = fields_in_watch.erase(it); - unsupported_fields++; } else { it++; } } // end for } // end if } - if ( unsupported_fields >0 ) { + if (unsupported_fields > 0) { RDC_LOG(RDC_DEBUG, "Skip watch " << unsupported_fields <<" fields as they are not supported."); } @@ -337,6 +348,12 @@ rdc_status_t RdcWatchTableImpl::update_field_in_table_when_unwatch( auto fite = fields.begin(); std::vector unwatch_fields; for (; fite != fields.end(); fite++) { + // Turn off any notification fields + if (notifications_->is_notification_event(fite->second)) { + notifications_->stop_listening(fite->first); + continue; + } + auto f_in_table = fields_to_watch_.find((*fite)); if (f_in_table == fields_to_watch_.end()) { // Not in fields_to_watch_ unwatch_fields.push_back({fite->first, fite->second}); @@ -468,7 +485,7 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { } // Clean up is expensive, only do it once per second - if (now - last_cleanup_time_ >1000) { + if (now - last_cleanup_time_ > 1000) { clean_up(); last_cleanup_time_ = now; } @@ -476,6 +493,55 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { return RDC_ST_OK; } +rdc_status_t +RdcWatchTableImpl::rdc_notif_update_cache( + rdc_evnt_notification_t *events, uint32_t num_events) { + if (events == nullptr || num_events == 0) { + return RDC_ST_BAD_PARAMETER; + } + std::lock_guard guard(watch_mutex_); + + for (uint32_t i = 0; i < num_events; i++) { + auto gpu_index = events[i].gpu_id; + auto field_id = events[i].field.field_id; + + // Always Update the timestamp + auto ite = fields_to_watch_.find({gpu_index, field_id}); + if (ite != fields_to_watch_.end()) { + 
ite->second.last_update_time = events[i].field.ts; + } + + // Only cache valid results + if (events[i].field.status != RDC_ST_OK) { + continue; + } + + // Update the cache + cache_mgr_->rdc_update_cache(gpu_index, events[i].field); + + // Update the job stats cache + std::string job_id; + if (is_job_watch_field(gpu_index, field_id, job_id)) { + cache_mgr_->rdc_update_job_stats(gpu_index, job_id, events[i].field); + } + } + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::rdc_field_listen_notif(uint32_t timeout_ms) { + rdc_status_t ret; + rdc_evnt_notification_t events[kMaxRSMIEvents]; + uint32_t num_events = kMaxRSMIEvents; + + ret = notifications_->listen(events, &num_events, timeout_ms); + + // Update cache + if (ret == RDC_ST_OK && num_events) { + ret = rdc_notif_update_cache(events, num_events); + } + return ret; +} + void RdcWatchTableImpl::clean_up() { struct timeval tv; gettimeofday(&tv, NULL); diff --git a/rdc_libs/rdc/src/RsmiUtils.cc b/rdc_libs/rdc/src/RsmiUtils.cc new file mode 100644 index 0000000000..c6dff35b47 --- /dev/null +++ b/rdc_libs/rdc/src/RsmiUtils.cc @@ -0,0 +1,73 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "rocm_smi/rocm_smi.h" +#include "rdc/rdc.h" + +namespace amd { +namespace rdc { + +rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi) { + switch (rsmi) { + case RSMI_STATUS_SUCCESS: + return RDC_ST_OK; + + case RSMI_STATUS_INVALID_ARGS: + return RDC_ST_BAD_PARAMETER; + + case RSMI_STATUS_NOT_SUPPORTED: + return RDC_ST_NOT_SUPPORTED; + + case RSMI_STATUS_NOT_FOUND: + return RDC_ST_NOT_FOUND; + + case RSMI_STATUS_OUT_OF_RESOURCES: + return RDC_ST_INSUFF_RESOURCES; + + case RSMI_STATUS_FILE_ERROR: + return RDC_ST_FILE_ERROR; + + case RSMI_STATUS_NO_DATA: + return RDC_ST_NO_DATA; + + case RSMI_STATUS_PERMISSION: + return RDC_ST_PERM_ERROR; + + case RSMI_STATUS_BUSY: + case RSMI_STATUS_UNKNOWN_ERROR: + case RSMI_STATUS_INTERNAL_EXCEPTION: + case RSMI_STATUS_INPUT_OUT_OF_BOUNDS: + case RSMI_STATUS_INIT_ERROR: + case RSMI_STATUS_NOT_YET_IMPLEMENTED: + case RSMI_STATUS_INSUFFICIENT_SIZE: + case RSMI_STATUS_INTERRUPT: + case RSMI_STATUS_UNEXPECTED_SIZE: + case RSMI_STATUS_UNEXPECTED_DATA: + case RSMI_STATUS_REFCOUNT_OVERFLOW: + default: + return RDC_ST_UNKNOWN_ERROR; + } +} + +} // namespace rdc +} // namespace amd + diff --git a/rdci/include/RdciDmonSubSystem.h b/rdci/include/RdciDmonSubSystem.h index d9d44b00f0..aa2ad3f3b1 100644 --- a/rdci/include/RdciDmonSubSystem.h +++ b/rdci/include/RdciDmonSubSystem.h @@ -65,7 +65,8 @@ class RdciDmonSubSystem: public RdciSubSystem { std::vector field_ids_; std::vector gpu_indexes_; bool need_cleanup_; - + uint64_t latest_time_stamp_; + bool show_timpstamps_; static volatile sig_atomic_t is_terminating_; static void set_terminating(int sig); }; diff --git a/rdci/src/RdciDmonSubSystem.cc b/rdci/src/RdciDmonSubSystem.cc index 47777e9f74..03a9627de2 100644 --- 
a/rdci/src/RdciDmonSubSystem.cc +++ b/rdci/src/RdciDmonSubSystem.cc @@ -23,11 +23,16 @@ THE SOFTWARE. #include #include #include +#include + #include #include #include #include #include +#include +#include +#include #include "rdc_lib/rdc_common.h" #include "common/rdc_utils.h" @@ -44,7 +49,8 @@ volatile sig_atomic_t RdciDmonSubSystem::is_terminating_ = 0; RdciDmonSubSystem::RdciDmonSubSystem(): dmon_ops_(DMON_MONITOR) - , need_cleanup_(false) { + , need_cleanup_(false) + , show_timpstamps_(false) { signal(SIGINT, set_terminating); } @@ -66,6 +72,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { {"help", optional_argument, nullptr, 'h'}, {"unauth", optional_argument, nullptr, 'u'}, {"list", optional_argument, nullptr, 'l'}, + {"time-stamp", optional_argument, nullptr, 't'}, {"list-all", optional_argument, nullptr, LIST_ALL_FIELDS_OPT}, {"field-group-id", required_argument, nullptr, 'f'}, {"field-id", required_argument, nullptr, 'e' }, @@ -81,7 +88,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { std::string gpu_indexes; std::string field_ids; - while ((opt = getopt_long(argc, argv, "hluf:g:c:d:e:i:", + while ((opt = getopt_long(argc, argv, "hltuf:g:c:d:e:i:", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: @@ -93,6 +100,9 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { case 'u': use_auth_ = false; break; + case 't': + show_timpstamps_ = true; + break; case 'l': dmon_ops_ = DMON_LIST_FIELDS; break; @@ -253,6 +263,8 @@ void RdciDmonSubSystem::show_help() const { << "names and \n" << " descriptions of the field " << "ids\n"; + std::cout << " -t --time-stamp Include timestamps in " + << "display\n"; std::cout << " --list-all Same as -l, except this " << "lists all possible\n" << " fields, including " @@ -327,6 +339,106 @@ void RdciDmonSubSystem::show_field_usage() const { "used in scripts." 
<< std::endl; } +static void separate_notf_events(const rdc_field_group_info_t *f_info, + std::vector *notif, + std::vector *reg_ev) { + assert(f_info != nullptr && notif != nullptr && reg_ev != nullptr); + + for (uint32_t i = 0; i < f_info->count; ++i) { + if (RDC_EVNT_IS_NOTIF_FIELD(f_info->field_ids[i])) { + notif->push_back(f_info->field_ids[i]); + } else { + reg_ev->push_back(f_info->field_ids[i]); + } + } +} + +typedef struct { + uint32_t dev_ind; + rdc_field_value val; +} notif_dev_value; + +struct Compare_ts { + bool operator()(const notif_dev_value& r1, const notif_dev_value& r2) { + return r1.val.ts > r2.val.ts; + } +}; + +typedef std::priority_queue, Compare_ts> field_pq_t; + +static void collect_new_notifs(rdc_handle_t h, + const rdc_group_info_t &group_info, + const std::vector ¬if_fields, + std::vector *notif_ts, field_pq_t *notif_pq) { + rdc_status_t ret; + notif_dev_value value; + std::string error_msg; + uint64_t next_ts; + + assert(notif_ts != nullptr); + + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { + for (uint32_t findex = 0; findex < notif_fields.size(); findex++) { + // There may be multiple, repeated events; get all of them + while (true) { + ret = rdc_field_get_value_since(h, group_info.entity_ids[gindex], + notif_fields[findex], (*notif_ts)[findex], &next_ts, &value.val); + + if (ret == RDC_ST_NOT_FOUND) { + break; + } else if (ret == RDC_ST_OK) { + (*notif_ts)[findex] = next_ts; + value.dev_ind = group_info.entity_ids[gindex]; + if (notif_pq != nullptr) { + notif_pq->push(value); + } + } else { + error_msg = "rdc_field_get_value_since() failed"; + throw RdcException(ret, error_msg.c_str()); + } + } + } + } +} + +// ts is milliseconds +static std::string ts_string(const time_t ts) { + struct tm *timeinfo; + time_t tmp_ts = ts/1000; + std::string ret; + + timeinfo = localtime(&tmp_ts); // NOLINT + + ret = asctime(timeinfo); // NOLINT + ret.pop_back(); + return ret; +} + +static void print_and_clr_notif_pq(field_pq_t 
*notif_pq, bool ts) { + assert(notif_pq != nullptr); + notif_dev_value v; + amd::rdc::fld_id2name_map_t &field_id_to_descript = + amd::rdc::get_field_id_description_from_id(); + while (!notif_pq->empty()) { + v = notif_pq->top(); + notif_pq->pop(); + + std::cout << v.dev_ind << "\t"; + + if (ts) { + std::cout << std::left << std::setw(25) << + ts_string(v.val.ts); + } + + std::cout << std::left << " **Event: " << + field_id_to_descript.at(v.val.field_id).label; + std::cout << std::left << "\t\"" << v.val.value.str << "\""; + + std::cout << std::endl; + } +} + void RdciDmonSubSystem::process() { if (dmon_ops_ == DMON_HELP || dmon_ops_ == DMON_UNKNOWN) { @@ -377,6 +489,11 @@ void RdciDmonSubSystem::process() { std::to_string(options_[OPTIONS_FIELD_GROUP_ID]) + " must contain at least 1 field."); } + // Divide field_info fields into 2 vectors, 1 for notifications + // and one for non-notifications. Handle these separately below. + std::vector notif_fields; + std::vector reg_fields; + separate_notf_events(&field_info, &notif_fields, &reg_fields); // keep extra 1 minute data double max_keep_age = options_[OPTIONS_DELAY]/1000.0 + 60; @@ -385,25 +502,66 @@ void RdciDmonSubSystem::process() { options_[OPTIONS_GROUP_ID], options_[OPTIONS_FIELD_GROUP_ID], options_[OPTIONS_DELAY]*1000, max_keep_age, max_keep_samples); need_cleanup_ = true; - std::cout << "GPU\t"; - for (uint32_t findex = 0; findex < field_info.count; findex++) { - std::cout << std::left << std::setw(20) - << field_id_string(field_info.field_ids[findex]); + + std::stringstream ss; + amd::rdc::fld_id2name_map_t &field_id_to_descript = + amd::rdc::get_field_id_description_from_id(); + + ss << "Listening for events: "; + // Comma-join the labels; an empty notif_fields prints nothing here. + for (uint32_t i = 0; i < notif_fields.size(); ++i) { + ss << (i ? ", " : "") << field_id_to_descript.at(notif_fields[i]).label; } - std::cout << std::endl; + ss << std::endl; + ss << "GPU\t"; + if (show_timpstamps_) { + ss << std::left << std::setw(25) <<
"TIMESTAMP"; + ss << " "; + } + for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { + ss << std::left << std::setw(20) + << field_id_string(reg_fields[findex]); + } + ss << std::endl; + + std::string header_line((std::istreambuf_iterator(ss)), + (std::istreambuf_iterator())); + + std::vector notif_ts(notif_fields.size()); + field_pq_t notif_pq; + + // Call this once without printing out notfications to initialize + // timestamps. There may be very stale timestamps in cache. + collect_new_notifs(rdc_handle_, group_info, notif_fields, + ¬if_ts, nullptr); for (uint32_t i = 0; i < options_[OPTIONS_COUNT]; i++) { + if (i % 50 == 0) { + std::cout << header_line; + } + usleep(options_[OPTIONS_DELAY]*1000); + + collect_new_notifs(rdc_handle_, group_info, notif_fields, + ¬if_ts, ¬if_pq); + + print_and_clr_notif_pq(¬if_pq, show_timpstamps_); + for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { std::cout << group_info.entity_ids[gindex] << "\t"; - for (uint32_t findex = 0; findex < field_info.count; findex++) { + for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { rdc_field_value value; + result = rdc_field_get_latest_value(rdc_handle_, - group_info.entity_ids[gindex], - field_info.field_ids[findex], &value); + group_info.entity_ids[gindex], reg_fields[findex], &value); if (result != RDC_ST_OK) { std::cout << std::left << std::setw(20) << "N/A"; } else { + if (show_timpstamps_ && findex == 0) { + std::cout << std::left << std::setw(25) << + ts_string(value.ts) << " "; + } + if (value.type == INTEGER) { std::cout << std::left << std::setw(20) << value.value.l_int; @@ -421,7 +579,9 @@ void RdciDmonSubSystem::process() { return; } } - std::cout << std::endl; + if (reg_fields.size()) { + std::cout << std::endl; + } } } diff --git a/server/src/rdc_api_service.cc b/server/src/rdc_api_service.cc index d798ff0199..33684def7b 100755 --- a/server/src/rdc_api_service.cc +++ b/server/src/rdc_api_service.cc @@ -385,7 +385,10 @@ 
RdcAPIServiceImpl::~RdcAPIServiceImpl() { } else if (value.type == DOUBLE) { reply->set_dbl(value.value.dbl); } else if (value.type == STRING || value.type == BLOB) { - reply->set_str(value.value.str); + std::string val_str(value.value.str); + // Drop trailing blanks (npos + 1 wraps to 0, so all-blank clears it). + val_str.erase(val_str.find_last_not_of(' ') + 1); + reply->set_str(val_str); } return ::grpc::Status::OK; diff --git a/server/src/rdc_server_main.cc b/server/src/rdc_server_main.cc index ee169a300b..aaf67f54b5 100755 --- a/server/src/rdc_server_main.cc +++ b/server/src/rdc_server_main.cc @@ -317,7 +317,6 @@ RDCServer::ShutDown(void) { delete api_service_; api_service_ = nullptr; } - } static void * ProcessSignalLoop(void *server_ptr) { diff --git a/tests/rdc_tests/test_base.cc b/tests/rdc_tests/test_base.cc index 87ed3a3872..26667eb8d6 100755 --- a/tests/rdc_tests/test_base.cc +++ b/tests/rdc_tests/test_base.cc @@ -38,7 +38,7 @@ rdc_status_t result; /*TestBase::TestBase() : description_(""), rdc_channel_(0) { }*/ -TestBase::TestBase() : description_(""){ +TestBase::TestBase() : description_("") { } TestBase::~TestBase() { } @@ -99,7 +99,6 @@ void TestBase::Close(void) { MakeHeaderStr(kCloseLabel, &label); printf("\n\t%s\n", label.c_str()); - } void TestBase::DisplayResults(void) const {