From 0813e7052f355cbe922db3e1bb3af1cc39bc0209 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 8 Apr 2020 08:47:29 -0400 Subject: [PATCH] Implement the rdc_lib API to support the job stats Add the function to start and stop the job recording. Add the function to get the job stats for each GPU and summary of multiple GPUs Add the function to remove the jobs. Add a class RdcLogger which can control the log level using the environment variable RDC_LOG. This is similar to GRPC_VERBOSITY gRPC. When the customer has the issues, he can enable the verbose log to help us to troubleshoot the issues. Add the -u support in the rdci group, fieldgroup and dmon for connecting to rdcd without authentication. Change-Id: I22c591823c1ee6485db106b911bed8271d1b2769 [ROCm/rdc commit: a547dc7efd818bdadf5e3f7aa2c197d85bd52d70] --- projects/rdc/README.md | 7 + projects/rdc/example/job_stats_example.cc | 6 +- projects/rdc/include/rdc/rdc.h | 37 +++- .../rdc/include/rdc_lib/RdcCacheManager.h | 16 +- .../rdc/include/rdc_lib/RdcGroupSettings.h | 2 +- projects/rdc/include/rdc_lib/RdcHandler.h | 10 +- projects/rdc/include/rdc_lib/RdcLogger.h | 59 +++++ projects/rdc/include/rdc_lib/RdcWatchTable.h | 8 +- .../rdc_lib/impl/RdcCacheManagerImpl.h | 45 +++- .../include/rdc_lib/impl/RdcEmbeddedHandler.h | 9 +- .../rdc_lib/impl/RdcGroupSettingsImpl.h | 4 +- .../rdc_lib/impl/RdcStandaloneHandler.h | 9 +- .../include/rdc_lib/impl/RdcWatchTableImpl.h | 27 ++- projects/rdc/include/rdc_lib/rdc_common.h | 16 +- projects/rdc/rdc_libs/CMakeLists.txt | 2 + .../rdc_libs/bootstrap/src/RdcBootStrap.cc | 33 ++- .../rdc/rdc_libs/bootstrap/src/RdcLogger.cc | 78 +++++++ .../rdc_libs/rdc/src/RdcCacheManagerImpl.cc | 204 +++++++++++++++++- .../rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 68 ++++-- .../rdc_libs/rdc/src/RdcGroupSettingsImpl.cc | 25 ++- .../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 26 ++- .../rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc | 171 +++++++++++++-- .../rdc_client/src/RdcStandaloneHandler.cc 
| 21 +- projects/rdc/rdci/src/RdciDmonSubSystem.cc | 12 +- .../rdc/rdci/src/RdciFieldGroupSubSystem.cc | 6 +- projects/rdc/rdci/src/RdciGroupSubSystem.cc | 6 +- 26 files changed, 805 insertions(+), 102 deletions(-) create mode 100644 projects/rdc/include/rdc_lib/RdcLogger.h create mode 100644 projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc diff --git a/projects/rdc/README.md b/projects/rdc/README.md index b145a5d1d7..2126779f73 100644 --- a/projects/rdc/README.md +++ b/projects/rdc/README.md @@ -16,9 +16,16 @@ LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -l ``` ## Troubleshooting +Enable the debug log: +``` +sudo RDC_LOG=DEBUG ./server/rdcd +``` + Check the ssl connection in rdci: ``` rdcd_hostname= "" # Set the rdcd you want to connect openssl s_client -connect $rdcd_hostname:50051 -cert /etc/rdc/client/certs/rdc_client_cert.pem -key /etc/rdc/client/private/rdc_client_cert.key -CAfile /etc/rdc/client/certs/rdc_cacert.pem ``` + + diff --git a/projects/rdc/example/job_stats_example.cc b/projects/rdc/example/job_stats_example.cc index 11c99ea127..2d9493f9d7 100644 --- a/projects/rdc/example/job_stats_example.cc +++ b/projects/rdc/example/job_stats_example.cc @@ -92,11 +92,9 @@ int main(int, char **) { goto cleanup; } - // (2) start the recording. Set the sample frequency to once per second, the - // max keep age to one hour and the maximum number of samples to - // keep to unlimited. + // (2) start the recording. Set the sample frequency to once per second. result = rdc_job_start_stats(rdc_handle, group_id, - job_id, 1000000, 3600, 0); + job_id, 1000000); if (result != RDC_ST_OK) { std::cout << "Error start job stats. 
Return: " << rdc_status_string(result); diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index e1674a4e22..a1066a2666 100755 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -52,6 +52,7 @@ typedef enum { RDC_ST_NOT_FOUND, //!< Cannot find the value RDC_ST_CONFLICT, //!< Conflict with current state RDC_ST_CLIENT_ERROR, //!< The RDC client error + RDC_ST_ALREADY_EXIST, //!< The item already exists RDC_ST_MAX_LIMIT //!< Max limit recording for the object } rdc_status_t; @@ -371,15 +372,10 @@ rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle); * * @param[in] update_freq How often to update this field in usec. * - * @param[in] max_keep_age How long to keep data for this field in seconds. - * - * @param[in] max_keep_samples Maximum number of samples to keep. 0=no limit. - * * @retval ::RDC_ST_OK is returned upon successful call. */ rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples); + rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq); /** * @brief Get the stats of the job using the job id. @@ -415,6 +411,35 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64], rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, char job_id[64]); +/** + * @brief Request RDC to stop tracking the job given by job_id + * + * @details After this call, you will no longer be able to call + * rdc_job_get_stats() on this job_id. But you will be able to reuse + * the job_id after this call. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] job_id The name of the job. + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, + char job_id[64]); + +/** + * @brief Request RDC to stop tracking all the jobs + * + * @details After this call, you will no longer be able to call + * rdc_job_get_stats() on any job id. But you will be able to reuse + * any previously used job id after this call. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle); + /** * @brief Request RDC to update all fields to be watched. * diff --git a/projects/rdc/include/rdc_lib/RdcCacheManager.h b/projects/rdc/include/rdc_lib/RdcCacheManager.h index 9948dd2c5e..77eab78123 100644 --- a/projects/rdc/include/rdc_lib/RdcCacheManager.h +++ b/projects/rdc/include/rdc_lib/RdcCacheManager.h @@ -24,6 +24,7 @@ THE SOFTWARE. #include #include +#include #include #include #include "rdc_lib/rdc_common.h" @@ -31,6 +32,7 @@ THE SOFTWARE. namespace amd { namespace rdc { +typedef std::map rdc_gpu_total_memory_t; class RdcCacheManager { public: @@ -43,7 +45,19 @@ class RdcCacheManager { const rdc_field_value& value) = 0; virtual rdc_status_t evict_cache(uint32_t gpu_index, uint32_t field_id, uint64_t max_keep_samples, double max_keep_age) = 0; - virtual uint32_t get_cache_size() = 0; + virtual std::string get_cache_stats() = 0; + + virtual rdc_status_t rdc_job_get_stats(char jobId[64], + const rdc_gpu_total_memory_t& total_memory, + rdc_job_info_t* p_job_info) = 0; + virtual rdc_status_t rdc_job_start_stats(char jobId[64], + const rdc_group_info_t& group, + const rdc_field_group_info_t& finfo) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index, + const std::string& job_id, const rdc_field_value& value) = 0; + virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; virtual ~RdcCacheManager() {} }; diff --git
a/projects/rdc/include/rdc_lib/RdcGroupSettings.h b/projects/rdc/include/rdc_lib/RdcGroupSettings.h index 2dd0d321ee..89d6f4ed22 100644 --- a/projects/rdc/include/rdc_lib/RdcGroupSettings.h +++ b/projects/rdc/include/rdc_lib/RdcGroupSettings.h @@ -59,7 +59,7 @@ class RdcGroupSettings { }; typedef std::shared_ptr RdcGroupSettingsPtr; - +const uint32_t JOB_FIELD_ID = 0; } // namespace rdc } // namespace amd diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h index 7c5d927641..c5071bafaa 100644 --- a/projects/rdc/include/rdc_lib/RdcHandler.h +++ b/projects/rdc/include/rdc_lib/RdcHandler.h @@ -33,12 +33,12 @@ class RdcHandler { public: // Job API virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) = 0; - virtual rdc_status_t rdc_job_get_stats(char jobId[64], + char job_id[64], uint64_t update_freq) = 0; + virtual rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info)= 0; - virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; - + virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; // Discovery API virtual rdc_status_t rdc_device_get_all( diff --git a/projects/rdc/include/rdc_lib/RdcLogger.h b/projects/rdc/include/rdc_lib/RdcLogger.h new file mode 100644 index 0000000000..b2d891b73e --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcLogger.h @@ -0,0 +1,59 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDC_LIB_RDCLOGGER_H_ +#define RDC_LIB_RDCLOGGER_H_ +#include +#include +#include // NOLINT + +namespace amd { +namespace rdc { +class RdcLogger { + public: + explicit RdcLogger(std::ostream& os); + + static RdcLogger& getLogger() { + static RdcLogger logger(std::cout); + return logger; + } + + bool should_log(uint32_t severity) { + return log_level_ >= severity; + } + + std::ostream& get_ostream() { + return os_; + } + + std::string get_log_header(uint32_t severity, + const char* file, int line); + + private: + std::ostream& os_; + uint32_t log_level_; +}; + +} // namespace rdc +} // namespace amd + + +#endif // RDC_LIB_RDCLOGGER_H_ diff --git a/projects/rdc/include/rdc_lib/RdcWatchTable.h b/projects/rdc/include/rdc_lib/RdcWatchTable.h index 67cbe61785..34f01750b0 100644 --- a/projects/rdc/include/rdc_lib/RdcWatchTable.h +++ b/projects/rdc/include/rdc_lib/RdcWatchTable.h @@ -36,10 +36,10 @@ class RdcWatchTable { virtual rdc_status_t rdc_field_update_all() = 0; virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64]) = 0; - virtual rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t group_id, - uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) = 0; + char job_id[64], uint64_t update_freq) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, uint64_t update_freq, diff --git a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h index 454554e4aa..952f376657 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -23,7 +23,8 @@ THE SOFTWARE. 
#define RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_ #include -#include +#include // NOLINT(build/c++11) +#include #include #include #include "rdc_lib/RdcCacheManager.h" @@ -41,6 +42,29 @@ struct RdcCacheEntry { typedef std::map> RdcCacheSamples; +struct FieldSummaryStats { + int64_t max_value; + int64_t min_value; + int64_t total_value; + uint64_t last_time; + uint64_t count; +}; + +struct GpuSummaryStats { + uint64_t energy_consumed; + uint64_t energy_last_time; + std::map field_summaries; +}; + +// Per job entry +struct RdcJobStatsCacheEntry { + uint64_t start_time; + uint64_t end_time; + std::map gpu_stats; +}; + +// +typedef std::map RdcJobStatsCache; class RdcCacheManagerImpl: public RdcCacheManager { public: @@ -53,10 +77,27 @@ class RdcCacheManagerImpl: public RdcCacheManager { const rdc_field_value& value) override; rdc_status_t evict_cache(uint32_t gpu_index, uint32_t field_id, uint64_t max_keep_samples, double max_keep_age) override; - uint32_t get_cache_size() override; + std::string get_cache_stats() override; + + rdc_status_t rdc_job_get_stats(char job_id[64], + const rdc_gpu_total_memory_t& total_memory, + rdc_job_info_t* p_job_info) override; + rdc_status_t rdc_job_start_stats(char job_id[64], + const rdc_group_info_t& group, + const rdc_field_group_info_t& finfo) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_update_job_stats(uint32_t gpu_index, + const std::string& job_id, + const rdc_field_value& value) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; private: + void set_summary(const FieldSummaryStats & stats, + rdc_stats_summary_t& gpu, rdc_stats_summary_t& summary, // NOLINT + unsigned int adjuster); RdcCacheSamples cache_samples_; + RdcJobStatsCache cache_jobs_; std::mutex cache_mutex_; }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index 8c5c2cf008..59e642993a 
100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -36,11 +36,12 @@ class RdcEmbeddedHandler: public RdcHandler { public: // Job API rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) override; - rdc_status_t rdc_job_get_stats(char jobId[64], + char job_id[64], uint64_t update_freq) override; + rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; // Discovery API rdc_status_t rdc_device_get_all( diff --git a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h index 9480d2e7fe..d616df5762 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h @@ -61,8 +61,8 @@ class RdcGroupSettingsImpl: public RdcGroupSettings { private: std::map gpu_group_; std::map field_group_; - uint32_t cur_group_id_ = 0; - uint32_t cur_filed_group_id_ = 0; + uint32_t cur_group_id_ = 1; + uint32_t cur_field_group_id_ = 0; std::mutex group_mutex_; std::mutex field_group_mutex_; }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h index 6e32a57c02..ea216993ae 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -33,11 +33,12 @@ class RdcStandaloneHandler: public RdcHandler { public: // Job RdcAPI rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) override; - rdc_status_t 
rdc_job_get_stats(char jobId[64], + char job_id[64], uint64_t update_freq) override; + rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; // Discovery RdcAPI rdc_status_t rdc_device_get_all( diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h index e8f2568c2f..266fd91911 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -22,11 +22,12 @@ THE SOFTWARE. #ifndef RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_ #define RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_ +#include #include #include #include #include -#include +#include // NOLINT #include #include "rdc_lib/RdcWatchTable.h" #include "rdc_lib/RdcGroupSettings.h" @@ -45,14 +46,18 @@ struct FieldSettings { uint64_t last_update_time; }; +struct JobWatchTableEntry { + uint32_t group_id; + std::vector fields; //< store fields for faster query +}; class RdcWatchTableImpl : public RdcWatchTable { public: rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64]) override; - rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t group_id, - uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) override; + char job_id[64], uint64_t update_freq) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, uint64_t update_freq, @@ -84,10 +89,16 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< Helper function to clean up the watch table and cache void clean_up(); + //!< Helper function for debug information in 
watch table and cache + void debug_status(); + //!< Helper function to get the fields using the group and the field group. rdc_status_t get_fields_from_group(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, std::vector & fields); + rdc_field_grp_t field_group_id, + std::vector & fields); // NOLINT + bool is_job_watch_field(uint32_t gpu_index, uint32_t field_id, + std::string& job_id) const; // NOLINT RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; @@ -96,6 +107,10 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< The watch table to store the watch settings. std::map watch_table_; + //!< pairs + std::map job_watch_table_; + + //!< The settings for each field can be deduced from watch_table. But every //!< rdc_field_update_all() call needs to deduce them. To improve the //!< performance, the fields_to_watch_ is used to track the field settings. diff --git a/projects/rdc/include/rdc_lib/rdc_common.h b/projects/rdc/include/rdc_lib/rdc_common.h index ca7b905dc8..d08ddae6fa 100644 --- a/projects/rdc/include/rdc_lib/rdc_common.h +++ b/projects/rdc/include/rdc_lib/rdc_common.h @@ -24,12 +24,18 @@ THE SOFTWARE. 
#define RDC_LIB_RDC_COMMON_H_ #include +#define RDC_ERROR 0 +#define RDC_INFO 1 +#define RDC_DEBUG 2 -#ifdef DEBUG -#define LOG_DEBUG(message) std::cout << message << std::endl -#else -#define LOG_DEBUG(message) -#endif +#define RDC_LOG(debug_level, msg) do { \ + auto& logger = amd::rdc::RdcLogger::getLogger(); \ + if (logger.should_log((debug_level))) { \ + logger.get_ostream() << \ + logger.get_log_header((debug_level), __FILE__, __LINE__) << \ + msg << std::endl; \ + } \ +} while (0) /** * @brief The strncpy but with null terminated diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index 2eab4b286c..7d1b39cb3c 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -115,8 +115,10 @@ set(CMAKE_VERBOSE_MAKEFILE on) set(BOOTSTRAP_LIB "rdc_bootstrap") set(BOOTSTRAP_LIB_COMPONENT "lib${BOOTSTRAP_LIB}") set(BOOTSTRAP_LIB_SRC_LIST "${SRC_DIR}/bootstrap/src/RdcBootStrap.cc") +set(BOOTSTRAP_LIB_SRC_LIST ${BOOTSTRAP_LIB_SRC_LIST} "${SRC_DIR}/bootstrap/src/RdcLogger.cc") set(BOOTSTRAP_LIB_INC_LIST "${RDC_LIB_INC_DIR}/rdc/rdc.h") set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/rdc_common.h") +set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcLogger.h") set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcHandler.h") message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}") diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index 96a5c5cd85..7d066fd558 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -24,6 +24,7 @@ THE SOFTWARE. 
#include #include "rdc/rdc.h" #include "rdc_lib/RdcHandler.h" +#include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" static void* libHandler = nullptr; @@ -96,7 +97,7 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, if (!libHandler) { error = dlerror(); - LOG_DEBUG("Fail to open librdc.so: " << error); + RDC_LOG(RDC_ERROR, "Fail to open librdc.so: " << error); return RDC_ST_FAIL_LOAD_MODULE; } @@ -104,7 +105,8 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, dlsym(libHandler, "make_handler"); if (!func_make_handler) { error = dlerror(); - LOG_DEBUG("Fail to find function make_handler:" << error); + RDC_LOG(RDC_ERROR, + "Fail to find function make_handler:" << error); return RDC_ST_FAIL_LOAD_MODULE; } @@ -144,15 +146,32 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64] , } rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples ) { + rdc_gpu_group_t groupId, char job_id[64], + uint64_t update_freq) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } return static_cast(p_rdc_handle)-> - rdc_job_start_stats(groupId, job_id, update_freq, - max_keep_age, max_keep_samples); + rdc_job_start_stats(groupId, job_id, update_freq); +} + +rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, char job_id[64]) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_job_remove(job_id); +} + +rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_job_remove_all(); } @@ -344,6 +363,8 @@ const char* rdc_status_string(rdc_status_t result) { return "The max limit reached"; case RDC_ST_CONFLICT: return "Conflict with current state"; + case RDC_ST_ALREADY_EXIST: + return "The value already exists"; case RDC_ST_CLIENT_ERROR: return "RDC 
Client error"; default: diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc new file mode 100644 index 0000000000..f5c28b88d2 --- /dev/null +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc @@ -0,0 +1,78 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include "rdc_lib/RdcLogger.h" +#include +#include +#include +#include +#include +#include // NOLINT +#include "rdc_lib/rdc_common.h" + +namespace amd { +namespace rdc { + +RdcLogger::RdcLogger(std::ostream& os): + os_(os) { + char* verbose = getenv("RDC_LOG"); + if (verbose == nullptr) { + log_level_ = RDC_ERROR; + } else if (strcmp(verbose, "DEBUG") == 0) { + log_level_ = RDC_DEBUG; + } else if (strcmp(verbose, "INFO") == 0) { + log_level_ = RDC_INFO; + } else { + log_level_ = RDC_ERROR; + } +} + +std::string RdcLogger::get_log_header(uint32_t severity, + const char* file, int line) { + std::stringstream strstream; + auto ms = std::chrono::duration_cast + (std::chrono::system_clock::now().time_since_epoch()).count(); + strstream << std::fixed << std::setprecision(3) << (ms/1000.0) << " "; + if (severity == RDC_DEBUG) { + strstream << "DEBUG "; + } else if (severity == RDC_INFO) { + strstream << "INFO "; + } else { + strstream << "ERROR "; + } + + // extract out the file path as it may be very long. + if (file != nullptr) { + std::string file_str(file); + auto found = file_str.find_last_of("/"); + if (found != std::string::npos) { + file_str = file_str.substr(found+1); + } + strstream << file_str << "(" << line << "): "; + } + + return strstream.str(); +} + + +} // namespace rdc +} // namespace amd + diff --git a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index 9834fa4725..57523ae600 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -22,6 +22,8 @@ THE SOFTWARE. 
#include "rdc_lib/impl/RdcCacheManagerImpl.h" #include #include +#include +#include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" @@ -127,15 +129,26 @@ rdc_status_t RdcCacheManagerImpl::rdc_field_get_latest_value( return RDC_ST_OK; } -uint32_t RdcCacheManagerImpl::get_cache_size() { - uint32_t cache_size = 0; +std::string RdcCacheManagerImpl::get_cache_stats() { + std::stringstream strstream; std::lock_guard guard(cache_mutex_); + strstream << "Cache samples:"; auto cache_samples_ite = cache_samples_.begin(); for (; cache_samples_ite != cache_samples_.end(); cache_samples_ite++) { - cache_size+=cache_samples_ite->second.size(); + strstream << "<" << cache_samples_ite->first.first << "," + << cache_samples_ite->first.second << ":" + << cache_samples_ite->second.size() << "> "; } - return cache_size; + + strstream <<" Job caches:"; + auto job_ite = cache_jobs_.begin(); + for ( ; job_ite != cache_jobs_.end(); job_ite++ ) { + strstream << "<" << job_ite->first << ":" + << job_ite->second.gpu_stats.size() << "> "; + } + + return strstream.str(); } rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index, @@ -162,5 +175,188 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index, return RDC_ST_OK; } +rdc_status_t RdcCacheManagerImpl::rdc_job_remove(char job_id[64]) { + std::lock_guard guard(cache_mutex_); + cache_jobs_.erase(job_id); + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_job_remove_all() { + std::lock_guard guard(cache_mutex_); + cache_jobs_.clear(); + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index, + const std::string& job_id, const rdc_field_value& value) { + std::lock_guard guard(cache_mutex_); + auto job_iter = cache_jobs_.find(job_id); + if (job_iter == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + + auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index); + if (gpu_iter == job_iter->second.gpu_stats.end()) { + return RDC_ST_NOT_FOUND; + } + 
+ auto fsummary = gpu_iter->second.field_summaries.find(value.field_id); + if (fsummary == gpu_iter->second.field_summaries.end()) { + return RDC_ST_NOT_FOUND; + } + if (fsummary->second.count == 0) { // first item + fsummary->second.count = 1; + fsummary->second.max_value = value.value.l_int; + fsummary->second.min_value = value.value.l_int; + fsummary->second.total_value = value.value.l_int; + fsummary->second.last_time = value.ts; + if (value.field_id == RDC_FI_POWER_USAGE) { + gpu_iter->second.energy_last_time = value.ts; + } + return RDC_ST_OK; + } + if (value.field_id == RDC_FI_POWER_USAGE) { + uint64_t time_elapsed = value.ts - gpu_iter->second.energy_last_time; + // Stored in cache as microseconds and microwatts + gpu_iter->second.energy_consumed += + (time_elapsed * value.value.l_int)/(1000.0*1000000); + } + fsummary->second.max_value = std::max(fsummary->second.max_value, + static_cast(value.value.l_int)); + fsummary->second.min_value = std::min(fsummary->second.min_value, + static_cast(value.value.l_int)); + fsummary->second.total_value += value.value.l_int; + fsummary->second.last_time = value.ts; + fsummary->second.count++; + + return RDC_ST_OK; +} + +void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats, + rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary, + unsigned int adjuster) { + if (stats.count == 0) return; + + gpu.max_value = stats.max_value / adjuster; + gpu.min_value = stats.min_value / adjuster; + gpu.average = stats.total_value / stats.count / adjuster; + summary.max_value = std::max(summary.max_value, gpu.max_value); + summary.min_value = std::min(summary.min_value, gpu.min_value); + //< save total for future average calculation. 
+ summary.average += gpu.average; +} + +rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], + const rdc_gpu_total_memory_t& total_memory, + rdc_job_info_t* p_job_info) { + std::lock_guard guard(cache_mutex_); + auto job_stats = cache_jobs_.find(jobId); + + if (job_stats == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + + //< Init the summary info + RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " <summary; + summary_info.start_time = job_stats->second.start_time; + if (job_stats->second.end_time == 0) { + summary_info.end_time = time(nullptr); + } else { + summary_info.end_time = job_stats->second.end_time; + } + summary_info.energy_consumed = 0; + summary_info.max_gpu_memory_used = 0; + summary_info.power_usage = {0, std::numeric_limits::max(), 0}; + summary_info.gpu_clock = {0, std::numeric_limits::max(), 0}; + summary_info.gpu_utilization = {0, std::numeric_limits::max(), 0}; + summary_info.memory_utilization = {0, + std::numeric_limits::max(), 0}; + + p_job_info->num_gpus = job_stats->second.gpu_stats.size(); + + //< Populate information for each GPUs + + auto gpus = job_stats->second.gpu_stats.begin(); + for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { + auto & gpu_info = p_job_info->gpus[gpus->first]; + gpu_info.start_time = summary_info.start_time; + gpu_info.end_time = summary_info.end_time; + gpu_info.energy_consumed = gpus->second.energy_consumed; + summary_info.energy_consumed += gpu_info.energy_consumed; + + auto ite = gpus->second.field_summaries.begin(); + for (; ite != gpus->second.field_summaries.end(); ite++) { + if (ite->first == RDC_FI_POWER_USAGE) { + set_summary(ite->second, + gpu_info.power_usage, summary_info.power_usage, 1000000); + } else if (ite->first == RDC_FI_GPU_MEMORY_USAGE) { + auto tmemory = total_memory.at(gpus->first); + set_summary(ite->second, gpu_info.memory_utilization, + summary_info.memory_utilization, tmemory/100); + gpu_info.max_gpu_memory_used = ite->second.max_value; + 
summary_info.max_gpu_memory_used = std::max( + summary_info.max_gpu_memory_used, + gpu_info.max_gpu_memory_used); + } else if (ite->first == RDC_FI_GPU_SM_CLOCK) { + set_summary(ite->second, gpu_info.gpu_clock, + summary_info.gpu_clock, 1000000); + } else if (ite->first == RDC_FI_GPU_UTIL) { + set_summary(ite->second, gpu_info.gpu_utilization, + summary_info.gpu_utilization, 1); + } + } + } + // Get the average of the summary + summary_info.power_usage.average = summary_info.power_usage.average/ + p_job_info->num_gpus; + summary_info.gpu_clock.average = summary_info.gpu_clock.average/ + p_job_info->num_gpus; + summary_info.gpu_utilization.average = summary_info.gpu_utilization.average/ + p_job_info->num_gpus; + summary_info.memory_utilization.average = + summary_info.memory_utilization.average/p_job_info->num_gpus; + + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], + const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo) { + RdcJobStatsCacheEntry cacheEntry; + cacheEntry.start_time = std::time(nullptr); + cacheEntry.end_time = 0; + for (unsigned int i=0 ; i < ginfo.count; i++) { // GPUs + GpuSummaryStats gstats; + gstats.energy_consumed = 0; + gstats.energy_last_time = 0; + for (unsigned int j = 0; j < finfo.count; j++) { // init fields + FieldSummaryStats s; + s.count = 0; + s.max_value = s.min_value = s.total_value = 0; + gstats.field_summaries.insert({finfo.field_ids[j], s}); + } + + cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats}); + } + + std::lock_guard guard(cache_mutex_); + cache_jobs_.insert({job_id, cacheEntry}); + return RDC_ST_OK; +} + + +rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) { + std::lock_guard guard(cache_mutex_); + auto job_stats = cache_jobs_.find(job_id); + + if (job_stats == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + + job_stats->second.end_time = std::time(nullptr); + + return RDC_ST_OK; +} + } // namespace rdc } // namespace amd diff 
--git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 8540fccfc7..94313d3610 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -27,6 +27,7 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcCacheManagerImpl.h" #include "rdc_lib/impl/RdcWatchTableImpl.h" #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcException.h" #include "rocm_smi/rocm_smi.h" @@ -73,6 +74,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode): , metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) { if (mode == RDC_OPERATION_MODE_AUTO) { + RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO"); metrics_updater_->start(); } } @@ -83,32 +85,49 @@ RdcEmbeddedHandler::~RdcEmbeddedHandler() { // JOB API rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) { - // TODO(bill_liu): implement - (void)(groupId); - (void)(job_id); - (void)(update_freq); - (void)(max_keep_age); - (void)(max_keep_samples); - - return RDC_ST_OK; + char job_id[64], uint64_t update_freq) { + return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq); } -rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], +rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], rdc_job_info_t* p_job_info) { - // TODO(bill_liu): implement - (void)(job_id); - (void)(p_job_info); - return RDC_ST_OK; + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + uint32_t count = 0; + rdc_status_t status = rdc_device_get_all( + gpu_index_list, &count); + if (status != RDC_ST_OK) { + return status; + } + + rdc_gpu_total_memory_t all_total_memory; + + for (uint32_t i = 0; i < count ; i++) { + rdc_field_value total_memory; + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], + 
RDC_FI_GPU_MEMORY_TOTAL, &total_memory); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU " + << gpu_index_list[i]); + return status; + } + all_total_memory.insert({gpu_index_list[i], total_memory.value.l_int}); + } + + return cache_mgr_->rdc_job_get_stats(job_id, all_total_memory, p_job_info); } -rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64] ) { - // TODO(bill_liu): implement - (void)(job_id); - return RDC_ST_OK; +rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64]) { + return watch_table_->rdc_job_stop_stats(job_id); } +rdc_status_t RdcEmbeddedHandler::rdc_job_remove(char job_id[64]) { + return watch_table_->rdc_job_remove(job_id); +} + + +rdc_status_t RdcEmbeddedHandler::rdc_job_remove_all() { + return watch_table_->rdc_job_remove_all(); +} // Discovery API rdc_status_t RdcEmbeddedHandler::rdc_device_get_all( @@ -194,6 +213,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, } if (!is_gpu_exist) { + RDC_LOG(RDC_INFO, "Fail to add GPU index " << gpu_index << " to group " + << group_id <<" as the GPU index is invalid."); return RDC_ST_NOT_FOUND; } @@ -211,6 +232,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_field_create(uint32_t num_field_ids, if (num_field_ids <= RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { for (uint32_t i = 0; i < num_field_ids; i++) { if (!metric_fetcher_->is_field_valid(field_ids[i])) { + RDC_LOG(RDC_INFO, + "Fail to create field group with unknown field id " + << field_ids[i]); return RDC_ST_NOT_SUPPORTED; } } @@ -285,6 +309,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_get_latest_value( return RDC_ST_BAD_PARAMETER; } if (!metric_fetcher_->is_field_valid(field)) { + RDC_LOG(RDC_INFO, + "Fail to get latest value with unknown field id " + << field); return RDC_ST_NOT_SUPPORTED; } return cache_mgr_->rdc_field_get_latest_value(gpu_index, field, value); @@ -297,6 +324,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_get_value_since(uint32_t gpu_index, 
return RDC_ST_BAD_PARAMETER; } if (!metric_fetcher_->is_field_valid(field)) { + RDC_LOG(RDC_INFO, + "Fail to get value since with unknown field id " + << field); return RDC_ST_NOT_SUPPORTED; } return cache_mgr_->rdc_field_get_value_since(gpu_index, field, diff --git a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc index 21d57f4136..eb89ddb505 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -22,11 +22,20 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcGroupSettingsImpl.h" #include #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" namespace amd { namespace rdc { RdcGroupSettingsImpl::RdcGroupSettingsImpl() { + // Add the default job stats fields + uint32_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, + RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL}; + char job_field_group[] = "JobStatsFields"; + rdc_field_grp_t fgid = JOB_FIELD_ID; + + rdc_group_field_create(sizeof(job_fields)/sizeof(uint32_t), + job_fields, job_field_group, &fgid); } rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create( @@ -62,6 +71,8 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add( // Check whether the index already exists for (uint32_t i=0; i < ite->second.count; i++) { if (ite->second.entity_ids[i] == gpu_index) { + RDC_LOG(RDC_INFO, "Fail to add " << gpu_index + <<" to GPU group " << groupId << " as it is already exists"); return RDC_ST_BAD_PARAMETER; } } @@ -136,15 +147,19 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create( if (field_group_.size() >= RDC_MAX_NUM_FIELD_GROUPS) { return RDC_ST_MAX_LIMIT; } - field_group_.emplace(cur_filed_group_id_, finfo); - *rdc_field_group_id = cur_filed_group_id_; - cur_filed_group_id_++; + field_group_.emplace(cur_field_group_id_, finfo); + *rdc_field_group_id = cur_field_group_id_; + cur_field_group_id_++; return RDC_ST_OK; } rdc_status_t 
RdcGroupSettingsImpl::rdc_group_field_destroy( rdc_field_grp_t rdc_field_group_id) { + if (rdc_field_group_id == JOB_FIELD_ID) { + RDC_LOG(RDC_INFO, "Cannot delete system JOB_FIELD_ID field group"); + return RDC_ST_BAD_PARAMETER; + } std::lock_guard guard(field_group_mutex_); field_group_.erase(rdc_field_group_id); return RDC_ST_OK; @@ -183,6 +198,10 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_all_ids( if (*count >= RDC_MAX_NUM_FIELD_GROUPS) { return RDC_ST_MAX_LIMIT; } + + // Skip system defined JOB_FIELD_ID + if (ite->first == JOB_FIELD_ID) continue; + field_group_id_list[*count] = ite->first; (*count)++; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 35ad6ed32f..114510b8e5 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -22,10 +22,11 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcMetricFetcherImpl.h" #include #include -#include +#include //NOLINT #include #include #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" #include "rocm_smi/rocm_smi.h" namespace amd { @@ -48,6 +49,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, uint64_t i64 = 0; if (!is_field_valid(field_id)) { + RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id + << " which is not supported"); return RDC_ST_NOT_SUPPORTED; } @@ -125,6 +128,27 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, break; } + gettimeofday(&tv, NULL); + int64_t latency = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000 + - value->ts; + if (value->status != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << + field_id_string(field_id) << " with rsmi error code " + << value->status <<", latency " << latency); + } else if (value->type == INTEGER) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << ":" << value->value.l_int + << 
", latency " << latency); + } else if (value->type == DOUBLE) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << ":" << value->value.dbl + << ", latency " << latency); + } else if (value->type == STRING) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << ":" << value->value.str + << ", latency " << latency); + } + return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 979e5eb536..47bc11fb4c 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -23,8 +23,10 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcWatchTableImpl.h" #include #include +#include #include #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" namespace amd { namespace rdc { @@ -39,24 +41,92 @@ RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, } rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64]) { - // TODO(bill_liu): implement - (void)(group_id); - (void)(job_id); - return RDC_ST_OK; + char job_id[64], uint64_t update_freq) { + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + if (job_watch_table_.find(job_id) != job_watch_table_.end()) { + return RDC_ST_ALREADY_EXIST; + } + } while (0); + + std::vector fields_in_watch; + rdc_status_t result = get_fields_from_group(group_id, + JOB_FIELD_ID, fields_in_watch); + JobWatchTableEntry jentry {group_id, fields_in_watch}; + job_watch_table_.insert({job_id, jentry}); + + result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); + if (result != RDC_ST_OK) { + return result; + } + + rdc_field_group_info_t finfo; + rdc_group_info_t ginfo; + result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + if (result != RDC_ST_OK) { + return result; + } + 
result = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &finfo); + if (result != RDC_ST_OK) { + return result; + } + + result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo); + + return result; } -rdc_status_t RdcWatchTableImpl::rdc_watch_job_fields(rdc_gpu_group_t group_id, - uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) { - // TODO(bill_liu): implement - (void)(group_id); - (void)(update_freq); - (void)(max_keep_age); - (void)(max_keep_samples); - return RDC_ST_OK; +rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) { + uint32_t job_group_id; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + auto job = job_watch_table_.find(job_id); + if (job == job_watch_table_.end()) { + return RDC_ST_NOT_FOUND; + } + job_group_id = job->second.group_id; + } while (0); + + rdc_status_t result = rdc_field_unwatch(job_group_id, JOB_FIELD_ID); + if (result != RDC_ST_OK) { + return result; + } + + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + job_watch_table_.erase(job_id); + } while (0); + + result = cache_mgr_->rdc_job_stop_stats(job_id); + + return result; } +rdc_status_t RdcWatchTableImpl::rdc_job_remove(char job_id[64]) { + rdc_job_stop_stats(job_id); + return cache_mgr_->rdc_job_remove(job_id); +} + +rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() { + // Get all the job ids; + std::vector v; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + for (auto ite = job_watch_table_.begin(); + ite != job_watch_table_.end(); ite++) { + v.push_back(ite->first); + } + } while (0); + + // Stop them + for (auto job = v.begin(); job != v.end(); job++) { + rdc_job_stop_stats(const_cast(job->c_str())); + } + + return cache_mgr_->rdc_job_remove_all(); +} + + rdc_status_t RdcWatchTableImpl::get_fields_from_group(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, std::vector & fields) { rdc_field_group_info_t finfo; @@ -228,6 +298,21 @@ 
rdc_status_t RdcWatchTableImpl::rdc_field_unwatch( return update_field_in_table_when_unwatch(ite->first); } +bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, + uint32_t field_id, std::string& job_id) const { + RdcFieldKey key{gpu_index, field_id}; + + for (auto ite = job_watch_table_.begin(); + ite != job_watch_table_.end(); ite++) { + auto& fields = ite->second.fields; + if (std::find(fields.begin(), fields.end(), key) != fields.end()) { + job_id = ite->first; + return true; + } + } + + return false; +} rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { uint32_t items_fetched = 0; @@ -251,13 +336,19 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { result = metric_fetcher_->fetch_smi_field( fite->first.first, fite->first.second, &value); if (result != RDC_ST_OK) { - LOG_DEBUG("Fail to fetch the field: " << rdc_status_string(result)); continue; } // Update the cache cache_mgr_->rdc_update_cache(fite->first.first, value); + // Update the job stats cache + std::string job_id; + if (is_job_watch_field(fite->first.first, fite->first.second, job_id)) { + cache_mgr_->rdc_update_job_stats(fite->first.first, job_id, value); + } + + // Update the last_upate_time gettimeofday(&tv, NULL); now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; @@ -303,6 +394,56 @@ void RdcWatchTableImpl::clean_up() { ++wite; } } + + // Debug log every 30 seconds + if (now/1000%30 == 0) { + debug_status(); + } +} + +void RdcWatchTableImpl::debug_status() { + RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size() + << " watch_table_:" << watch_table_.size() + << " job_watch_table_:" << job_watch_table_.size() + << " cache stats:" << cache_mgr_->get_cache_stats()); + + if (watch_table_.size() > 0) { + RDC_LOG(RDC_DEBUG, "watch table details:"); + } + for (auto wite = watch_table_.begin(); wite != watch_table_.end(); wite++) { + RDC_LOG(RDC_DEBUG, wite->first.first << "," << wite->first.second + << ": age:" << wite->second.max_keep_age << ", samples:" + << 
wite->second.max_keep_samples << ", is_watching:" + << wite->second.is_watching << ", last_update_time:" + << wite->second.last_update_time <<", update_freq:" + << wite->second.update_freq); + } + + if (job_watch_table_.size() > 0) { + RDC_LOG(RDC_DEBUG, "job watch table details: "); + } + for (auto jite = job_watch_table_.begin(); + jite !=job_watch_table_.end(); jite++) { + std::stringstream strstream; + for (const auto& p : jite->second.fields) { + strstream << "<" << p.first << "," << p.second << "> "; + } + RDC_LOG(RDC_DEBUG, jite->first << ": " << jite->second.group_id + << " fields : "<< strstream.str()); + } + + if (fields_to_watch_.size() > 0) { + RDC_LOG(RDC_DEBUG, "fields to watch details:"); + } + for (auto fite = fields_to_watch_.begin(); fite != fields_to_watch_.end(); + fite++) { + RDC_LOG(RDC_DEBUG, fite->first.first << "," << fite->first.second + << ": age:" << fite->second.max_keep_age << ", samples:" + << fite->second.max_keep_samples << ", is_watching:" + << fite->second.is_watching << ", last_update_time:" + << fite->second.last_update_time <<", update_freq:" + << fite->second.update_freq); + } } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 810cfb8c99..3c7088e3a8 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -62,19 +62,16 @@ rdc_status_t RdcStandaloneHandler::error_handle(::grpc::Status status, // JOB RdcAPI rdc_status_t RdcStandaloneHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) { + char job_id[64], uint64_t update_freq) { // TODO(bill_liu): implement (void)(groupId); (void)(job_id); (void)(update_freq); - (void)(max_keep_age); - (void)(max_keep_samples); return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char 
job_id[64], +rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64], rdc_job_info_t* p_job_info) { // TODO(bill_liu): implement (void)(job_id); @@ -82,13 +79,25 @@ rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64], return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_job_stop_stats(char job_id[64] ) { +rdc_status_t RdcStandaloneHandler::rdc_job_stop_stats(char job_id[64]) { // TODO(bill_liu): implement (void)(job_id); return RDC_ST_OK; } +rdc_status_t RdcStandaloneHandler::rdc_job_remove(char job_id[64]) { + // TODO(bill_liu): implement + (void)(job_id); + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_job_remove_all() { + // TODO(bill_liu): implement + return RDC_ST_OK; +} + + // Discovery RdcAPI rdc_status_t RdcStandaloneHandler::rdc_device_get_all( uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { diff --git a/projects/rdc/rdci/src/RdciDmonSubSystem.cc b/projects/rdc/rdci/src/RdciDmonSubSystem.cc index f073ba65c2..c1618d785d 100644 --- a/projects/rdc/rdci/src/RdciDmonSubSystem.cc +++ b/projects/rdc/rdci/src/RdciDmonSubSystem.cc @@ -58,6 +58,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { const struct option long_options[] = { {"host", required_argument, nullptr, HOST_OPTIONS }, {"help", optional_argument, nullptr, 'h' }, + {"unauth", optional_argument, nullptr, 'u' }, {"list", optional_argument, nullptr, 'l' }, {"field-group-id", required_argument, nullptr, 'f' }, {"field-id", required_argument, nullptr, 'e' }, @@ -73,7 +74,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { std::string gpu_indexes; std::string field_ids; - while ((opt = getopt_long(argc, argv, "hlf:g:c:d:e:i:", + while ((opt = getopt_long(argc, argv, "hluf:g:c:d:e:i:", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: @@ -82,9 +83,12 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { case 'h': dmon_ops_ = DMON_HELP; return; + case 'u': + 
use_auth_ = false; + break; case 'l': dmon_ops_ = DMON_LIST_FIELDS; - return; + break; case 'f': if (!IsNumber(optarg)) { show_help(); @@ -130,6 +134,10 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { } } + if (dmon_ops_ == DMON_LIST_FIELDS) { + return; + } + if (options_.find(OPTIONS_FIELD_GROUP_ID) == options_.end()) { if (field_ids == "") { show_help(); diff --git a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc index 4d9b07d6c7..4961cd6698 100644 --- a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc +++ b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc @@ -40,6 +40,7 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { const struct option long_options[] = { {"host", required_argument, nullptr, HOST_OPTIONS }, {"help", optional_argument, nullptr, 'h' }, + {"unauth", optional_argument, nullptr, 'u' }, {"list", optional_argument, nullptr, 'l' }, {"group", required_argument, nullptr, 'g'}, {"create", required_argument, nullptr, 'c' }, @@ -52,7 +53,7 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { int option_index = 0; int opt = 0; - while ((opt = getopt_long(argc, argv, "hlif:c:g:d:", + while ((opt = getopt_long(argc, argv, "hluif:c:g:d:", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: @@ -61,6 +62,9 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { case 'h': field_group_ops_ = FIELD_GROUP_HELP; return; + case 'u': + use_auth_ = false; + break; case 'l': field_group_ops_ = FIELD_GROUP_LIST; break; diff --git a/projects/rdc/rdci/src/RdciGroupSubSystem.cc b/projects/rdc/rdci/src/RdciGroupSubSystem.cc index e8f58879cb..c105333217 100644 --- a/projects/rdc/rdci/src/RdciGroupSubSystem.cc +++ b/projects/rdc/rdci/src/RdciGroupSubSystem.cc @@ -40,6 +40,7 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { const struct option long_options[] = { {"host", required_argument, nullptr, 
HOST_OPTIONS }, {"help", optional_argument, nullptr, 'h' }, + {"unauth", optional_argument, nullptr, 'u' }, {"list", optional_argument, nullptr, 'l' }, {"group", required_argument, nullptr, 'g'}, {"create", required_argument, nullptr, 'c' }, @@ -52,7 +53,7 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { int option_index = 0; int opt = 0; - while ((opt = getopt_long(argc, argv, "hlic:g:a:d:", + while ((opt = getopt_long(argc, argv, "hluic:g:a:d:", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: @@ -61,6 +62,9 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { case 'h': group_ops_ = GROUP_HELP; return; + case 'u': + use_auth_ = false; + break; case 'l': group_ops_ = GROUP_LIST; break;