diff --git a/projects/rdc/README.md b/projects/rdc/README.md index b145a5d1d7..2126779f73 100644 --- a/projects/rdc/README.md +++ b/projects/rdc/README.md @@ -16,9 +16,16 @@ LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -l ``` ## Troubleshooting +Enable the debug log: +``` +sudo RDC_LOG=DEBUG ./server/rdcd +``` + Check the ssl connection in rdci: ``` rdcd_hostname= "" # Set the rdcd you want to connect openssl s_client -connect $rdcd_hostname:50051 -cert /etc/rdc/client/certs/rdc_client_cert.pem -key /etc/rdc/client/private/rdc_client_cert.key -CAfile /etc/rdc/client/certs/rdc_cacert.pem ``` + + diff --git a/projects/rdc/example/job_stats_example.cc b/projects/rdc/example/job_stats_example.cc index 11c99ea127..2d9493f9d7 100644 --- a/projects/rdc/example/job_stats_example.cc +++ b/projects/rdc/example/job_stats_example.cc @@ -92,11 +92,9 @@ int main(int, char **) { goto cleanup; } - // (2) start the recording. Set the sample frequency to once per second, the - // max keep age to one hour and the maximum number of samples to - // keep to unlimited. + // (2) start the recording. Set the sample frequency to once per second. result = rdc_job_start_stats(rdc_handle, group_id, - job_id, 1000000, 3600, 0); + job_id, 1000000); if (result != RDC_ST_OK) { std::cout << "Error start job stats. Return: " << rdc_status_string(result); diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index e1674a4e22..a1066a2666 100755 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -52,6 +52,7 @@ typedef enum { RDC_ST_NOT_FOUND, //!< Cannot find the value RDC_ST_CONFLICT, //!< Conflict with current state RDC_ST_CLIENT_ERROR, //!< The RDC client error + RDC_ST_ALREADY_EXIST, //!< The item already exists RDC_ST_MAX_LIMIT //!< Max limit recording for the object } rdc_status_t; @@ -371,15 +372,10 @@ rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle); * * @param[in] update_freq How often to update this field in usec. 
* - * @param[in] max_keep_age How long to keep data for this field in seconds. - * - * @param[in] max_keep_samples Maximum number of samples to keep. 0=no limit. - * * @retval ::RDC_ST_OK is returned upon successful call. */ rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples); + rdc_gpu_group_t group_id, char job_id[64], uint64_t update_freq); /** * @brief Get the stats of the job using the job id. @@ -415,6 +411,35 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64], rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, char job_id[64]); +/** + * @brief Request RDC to stop tracking the job given by job_id + * + * @details After this call, you will no longer be able to call + * rdc_job_get_stats() on this job_id. But you will be able to reuse + * the job_id after this call. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] job_id The name of the job. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, + char job_id[64]); + +/** + * @brief Request RDC to stop tracking all the jobs + * + * @details After this call, you will no longer be able to call + * rdc_job_get_stats() on any job id. But you will be able to reuse + * any previously used job id after this call. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle); + /** * @brief Request RDC to update all fields to be watched. * diff --git a/projects/rdc/include/rdc_lib/RdcCacheManager.h b/projects/rdc/include/rdc_lib/RdcCacheManager.h index 9948dd2c5e..77eab78123 100644 --- a/projects/rdc/include/rdc_lib/RdcCacheManager.h +++ b/projects/rdc/include/rdc_lib/RdcCacheManager.h @@ -24,6 +24,7 @@ THE SOFTWARE. 
#include #include +#include #include #include #include "rdc_lib/rdc_common.h" @@ -31,6 +32,7 @@ THE SOFTWARE. namespace amd { namespace rdc { +typedef std::map rdc_gpu_total_memory_t; class RdcCacheManager { public: @@ -43,7 +45,19 @@ class RdcCacheManager { const rdc_field_value& value) = 0; virtual rdc_status_t evict_cache(uint32_t gpu_index, uint32_t field_id, uint64_t max_keep_samples, double max_keep_age) = 0; - virtual uint32_t get_cache_size() = 0; + virtual std::string get_cache_stats() = 0; + + virtual rdc_status_t rdc_job_get_stats(char jobId[64], + const rdc_gpu_total_memory_t& total_memory, + rdc_job_info_t* p_job_info) = 0; + virtual rdc_status_t rdc_job_start_stats(char jobId[64], + const rdc_group_info_t& group, + const rdc_field_group_info_t& finfo) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index, + const std::string& job_id, const rdc_field_value& value) = 0; + virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; virtual ~RdcCacheManager() {} }; diff --git a/projects/rdc/include/rdc_lib/RdcGroupSettings.h b/projects/rdc/include/rdc_lib/RdcGroupSettings.h index 2dd0d321ee..89d6f4ed22 100644 --- a/projects/rdc/include/rdc_lib/RdcGroupSettings.h +++ b/projects/rdc/include/rdc_lib/RdcGroupSettings.h @@ -59,7 +59,7 @@ class RdcGroupSettings { }; typedef std::shared_ptr RdcGroupSettingsPtr; - +const uint32_t JOB_FIELD_ID = 0; } // namespace rdc } // namespace amd diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h index 7c5d927641..c5071bafaa 100644 --- a/projects/rdc/include/rdc_lib/RdcHandler.h +++ b/projects/rdc/include/rdc_lib/RdcHandler.h @@ -33,12 +33,12 @@ class RdcHandler { public: // Job API virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) = 0; - virtual 
rdc_status_t rdc_job_get_stats(char jobId[64], + char job_id[64], uint64_t update_freq) = 0; + virtual rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info)= 0; - virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; - + virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; // Discovery API virtual rdc_status_t rdc_device_get_all( diff --git a/projects/rdc/include/rdc_lib/RdcLogger.h b/projects/rdc/include/rdc_lib/RdcLogger.h new file mode 100644 index 0000000000..b2d891b73e --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcLogger.h @@ -0,0 +1,59 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDC_LIB_RDCLOGGER_H_ +#define RDC_LIB_RDCLOGGER_H_ +#include +#include +#include // NOLINT + +namespace amd { +namespace rdc { +class RdcLogger { + public: + explicit RdcLogger(std::ostream& os); + + static RdcLogger& getLogger() { + static RdcLogger logger(std::cout); + return logger; + } + + bool should_log(uint32_t severity) { + return log_level_ >= severity; + } + + std::ostream& get_ostream() { + return os_; + } + + std::string get_log_header(uint32_t severity, + const char* file, int line); + + private: + std::ostream& os_; + uint32_t log_level_; +}; + +} // namespace rdc +} // namespace amd + + +#endif // RDC_LIB_RDCLOGGER_H_ diff --git a/projects/rdc/include/rdc_lib/RdcWatchTable.h b/projects/rdc/include/rdc_lib/RdcWatchTable.h index 67cbe61785..34f01750b0 100644 --- a/projects/rdc/include/rdc_lib/RdcWatchTable.h +++ b/projects/rdc/include/rdc_lib/RdcWatchTable.h @@ -36,10 +36,10 @@ class RdcWatchTable { virtual rdc_status_t rdc_field_update_all() = 0; virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64]) = 0; - virtual rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t group_id, - uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) = 0; + char job_id[64], uint64_t update_freq) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; + virtual rdc_status_t rdc_job_remove_all() = 0; virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, uint64_t update_freq, diff --git a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h index 454554e4aa..952f376657 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -23,7 +23,8 @@ THE SOFTWARE. 
#define RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_ #include -#include +#include // NOLINT(build/c++11) +#include #include #include #include "rdc_lib/RdcCacheManager.h" @@ -41,6 +42,29 @@ struct RdcCacheEntry { typedef std::map> RdcCacheSamples; +struct FieldSummaryStats { + int64_t max_value; + int64_t min_value; + int64_t total_value; + uint64_t last_time; + uint64_t count; +}; + +struct GpuSummaryStats { + uint64_t energy_consumed; + uint64_t energy_last_time; + std::map field_summaries; +}; + +// Per job entry +struct RdcJobStatsCacheEntry { + uint64_t start_time; + uint64_t end_time; + std::map gpu_stats; +}; + +// +typedef std::map RdcJobStatsCache; class RdcCacheManagerImpl: public RdcCacheManager { public: @@ -53,10 +77,27 @@ class RdcCacheManagerImpl: public RdcCacheManager { const rdc_field_value& value) override; rdc_status_t evict_cache(uint32_t gpu_index, uint32_t field_id, uint64_t max_keep_samples, double max_keep_age) override; - uint32_t get_cache_size() override; + std::string get_cache_stats() override; + + rdc_status_t rdc_job_get_stats(char job_id[64], + const rdc_gpu_total_memory_t& total_memory, + rdc_job_info_t* p_job_info) override; + rdc_status_t rdc_job_start_stats(char job_id[64], + const rdc_group_info_t& group, + const rdc_field_group_info_t& finfo) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_update_job_stats(uint32_t gpu_index, + const std::string& job_id, + const rdc_field_value& value) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; private: + void set_summary(const FieldSummaryStats & stats, + rdc_stats_summary_t& gpu, rdc_stats_summary_t& summary, // NOLINT + unsigned int adjuster); RdcCacheSamples cache_samples_; + RdcJobStatsCache cache_jobs_; std::mutex cache_mutex_; }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index 8c5c2cf008..59e642993a 
100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -36,11 +36,12 @@ class RdcEmbeddedHandler: public RdcHandler { public: // Job API rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) override; - rdc_status_t rdc_job_get_stats(char jobId[64], + char job_id[64], uint64_t update_freq) override; + rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; // Discovery API rdc_status_t rdc_device_get_all( diff --git a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h index 9480d2e7fe..d616df5762 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h @@ -61,8 +61,8 @@ class RdcGroupSettingsImpl: public RdcGroupSettings { private: std::map gpu_group_; std::map field_group_; - uint32_t cur_group_id_ = 0; - uint32_t cur_filed_group_id_ = 0; + uint32_t cur_group_id_ = 1; + uint32_t cur_field_group_id_ = 0; std::mutex group_mutex_; std::mutex field_group_mutex_; }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h index 6e32a57c02..ea216993ae 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -33,11 +33,12 @@ class RdcStandaloneHandler: public RdcHandler { public: // Job RdcAPI rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) override; - rdc_status_t 
rdc_job_get_stats(char jobId[64], + char job_id[64], uint64_t update_freq) override; + rdc_status_t rdc_job_get_stats(char jobId[64], rdc_job_info_t* p_job_info) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; // Discovery RdcAPI rdc_status_t rdc_device_get_all( diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h index e8f2568c2f..266fd91911 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -22,11 +22,12 @@ THE SOFTWARE. #ifndef RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_ #define RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_ +#include #include #include #include #include -#include +#include // NOLINT #include #include "rdc_lib/RdcWatchTable.h" #include "rdc_lib/RdcGroupSettings.h" @@ -45,14 +46,18 @@ struct FieldSettings { uint64_t last_update_time; }; +struct JobWatchTableEntry { + uint32_t group_id; + std::vector fields; //< store fields for faster query +}; class RdcWatchTableImpl : public RdcWatchTable { public: rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64]) override; - rdc_status_t rdc_watch_job_fields(rdc_gpu_group_t group_id, - uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) override; + char job_id[64], uint64_t update_freq) override; + rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + rdc_status_t rdc_job_remove(char job_id[64]) override; + rdc_status_t rdc_job_remove_all() override; rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, uint64_t update_freq, @@ -84,10 +89,16 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< Helper function to clean up the watch table and cache void clean_up(); + //!< Helper function for debug information in 
watch table and cache + void debug_status(); + //!< Helper function to get the fields using the group and the field group. rdc_status_t get_fields_from_group(rdc_gpu_group_t group_id, - rdc_field_grp_t field_group_id, std::vector & fields); + rdc_field_grp_t field_group_id, + std::vector & fields); // NOLINT + bool is_job_watch_field(uint32_t gpu_index, uint32_t field_id, + std::string& job_id) const; // NOLINT RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; @@ -96,6 +107,10 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< The watch table to store the watch settings. std::map watch_table_; + //!< pairs + std::map job_watch_table_; + + //!< The settings for each field can be deduced from watch_table. But every //!< rdc_field_update_all() call needs to deduce them. To improve the //!< performance, the fields_to_watch_ is used to track the field settings. diff --git a/projects/rdc/include/rdc_lib/rdc_common.h b/projects/rdc/include/rdc_lib/rdc_common.h index ca7b905dc8..d08ddae6fa 100644 --- a/projects/rdc/include/rdc_lib/rdc_common.h +++ b/projects/rdc/include/rdc_lib/rdc_common.h @@ -24,12 +24,18 @@ THE SOFTWARE. 
#define RDC_LIB_RDC_COMMON_H_ #include +#define RDC_ERROR 0 +#define RDC_INFO 1 +#define RDC_DEBUG 2 -#ifdef DEBUG -#define LOG_DEBUG(message) std::cout << message << std::endl -#else -#define LOG_DEBUG(message) -#endif +#define RDC_LOG(debug_level, msg) do { \ + auto& logger = amd::rdc::RdcLogger::getLogger(); \ + if (logger.should_log((debug_level))) { \ + logger.get_ostream() << \ + logger.get_log_header((debug_level), __FILE__, __LINE__) << \ + msg << std::endl; \ + } \ +} while (0) /** * @brief The strncpy but with null terminated diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index 2eab4b286c..7d1b39cb3c 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -115,8 +115,10 @@ set(CMAKE_VERBOSE_MAKEFILE on) set(BOOTSTRAP_LIB "rdc_bootstrap") set(BOOTSTRAP_LIB_COMPONENT "lib${BOOTSTRAP_LIB}") set(BOOTSTRAP_LIB_SRC_LIST "${SRC_DIR}/bootstrap/src/RdcBootStrap.cc") +set(BOOTSTRAP_LIB_SRC_LIST ${BOOTSTRAP_LIB_SRC_LIST} "${SRC_DIR}/bootstrap/src/RdcLogger.cc") set(BOOTSTRAP_LIB_INC_LIST "${RDC_LIB_INC_DIR}/rdc/rdc.h") set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/rdc_common.h") +set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcLogger.h") set(BOOTSTRAP_LIB_INC_LIST ${BOOTSTRAP_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcHandler.h") message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}") diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index 96a5c5cd85..7d066fd558 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -24,6 +24,7 @@ THE SOFTWARE. 
#include #include "rdc/rdc.h" #include "rdc_lib/RdcHandler.h" +#include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" static void* libHandler = nullptr; @@ -96,7 +97,7 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, if (!libHandler) { error = dlerror(); - LOG_DEBUG("Fail to open librdc.so: " << error); + RDC_LOG(RDC_ERROR, "Fail to open librdc.so: " << error); return RDC_ST_FAIL_LOAD_MODULE; } @@ -104,7 +105,8 @@ rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, dlsym(libHandler, "make_handler"); if (!func_make_handler) { error = dlerror(); - LOG_DEBUG("Fail to find function make_handler:" << error); + RDC_LOG(RDC_ERROR, + "Fail to find function make_handler:" << error); return RDC_ST_FAIL_LOAD_MODULE; } @@ -144,15 +146,32 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, char job_id[64] , } rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, - rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq, - double max_keep_age, uint32_t max_keep_samples ) { + rdc_gpu_group_t groupId, char job_id[64], + uint64_t update_freq) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } return static_cast(p_rdc_handle)-> - rdc_job_start_stats(groupId, job_id, update_freq, - max_keep_age, max_keep_samples); + rdc_job_start_stats(groupId, job_id, update_freq); +} + +rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, char job_id[64]) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_job_remove(job_id); +} + +rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_job_remove_all(); } @@ -344,6 +363,8 @@ const char* rdc_status_string(rdc_status_t result) { return "The max limit reached"; case RDC_ST_CONFLICT: return "Conflict with current state"; + case RDC_ST_ALREADY_EXIST: + return "The value already exists"; case RDC_ST_CLIENT_ERROR: return "RDC 
Client error"; default: diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc new file mode 100644 index 0000000000..f5c28b88d2 --- /dev/null +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcLogger.cc @@ -0,0 +1,78 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include "rdc_lib/RdcLogger.h" +#include +#include +#include +#include +#include +#include // NOLINT +#include "rdc_lib/rdc_common.h" + +namespace amd { +namespace rdc { + +RdcLogger::RdcLogger(std::ostream& os): + os_(os) { + char* verbose = getenv("RDC_LOG"); + if (verbose == nullptr) { + log_level_ = RDC_ERROR; + } else if (strcmp(verbose, "DEBUG") == 0) { + log_level_ = RDC_DEBUG; + } else if (strcmp(verbose, "INFO") == 0) { + log_level_ = RDC_INFO; + } else { + log_level_ = RDC_ERROR; + } +} + +std::string RdcLogger::get_log_header(uint32_t severity, + const char* file, int line) { + std::stringstream strstream; + auto ms = std::chrono::duration_cast + (std::chrono::system_clock::now().time_since_epoch()).count(); + strstream << std::fixed << std::setprecision(3) << (ms/1000.0) << " "; + if (severity == RDC_DEBUG) { + strstream << "DEBUG "; + } else if (severity == RDC_INFO) { + strstream << "INFO "; + } else { + strstream << "ERROR "; + } + + // extract out the file path as it may be very long. + if (file != nullptr) { + std::string file_str(file); + auto found = file_str.find_last_of("/"); + if (found != std::string::npos) { + file_str = file_str.substr(found+1); + } + strstream << file_str << "(" << line << "): "; + } + + return strstream.str(); +} + + +} // namespace rdc +} // namespace amd + diff --git a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index 9834fa4725..57523ae600 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -22,6 +22,8 @@ THE SOFTWARE. 
#include "rdc_lib/impl/RdcCacheManagerImpl.h" #include #include +#include +#include "rdc_lib/RdcLogger.h" #include "rdc_lib/rdc_common.h" @@ -127,15 +129,26 @@ rdc_status_t RdcCacheManagerImpl::rdc_field_get_latest_value( return RDC_ST_OK; } -uint32_t RdcCacheManagerImpl::get_cache_size() { - uint32_t cache_size = 0; +std::string RdcCacheManagerImpl::get_cache_stats() { + std::stringstream strstream; std::lock_guard guard(cache_mutex_); + strstream << "Cache samples:"; auto cache_samples_ite = cache_samples_.begin(); for (; cache_samples_ite != cache_samples_.end(); cache_samples_ite++) { - cache_size+=cache_samples_ite->second.size(); + strstream << "<" << cache_samples_ite->first.first << "," + << cache_samples_ite->first.second << ":" + << cache_samples_ite->second.size() << "> "; } - return cache_size; + + strstream <<" Job caches:"; + auto job_ite = cache_jobs_.begin(); + for ( ; job_ite != cache_jobs_.end(); job_ite++ ) { + strstream << "<" << job_ite->first << ":" + << job_ite->second.gpu_stats.size() << "> "; + } + + return strstream.str(); } rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index, @@ -162,5 +175,188 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index, return RDC_ST_OK; } +rdc_status_t RdcCacheManagerImpl::rdc_job_remove(char job_id[64]) { + std::lock_guard guard(cache_mutex_); + cache_jobs_.erase(job_id); + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_job_remove_all() { + std::lock_guard guard(cache_mutex_); + cache_jobs_.clear(); + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index, + const std::string& job_id, const rdc_field_value& value) { + std::lock_guard guard(cache_mutex_); + auto job_iter = cache_jobs_.find(job_id); + if (job_iter == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + + auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index); + if (gpu_iter == job_iter->second.gpu_stats.end()) { + return RDC_ST_NOT_FOUND; + } + 
+ auto fsummary = gpu_iter->second.field_summaries.find(value.field_id); + if (fsummary == gpu_iter->second.field_summaries.end()) { + return RDC_ST_NOT_FOUND; + } + if (fsummary->second.count == 0) { // first item + fsummary->second.count = 1; + fsummary->second.max_value = value.value.l_int; + fsummary->second.min_value = value.value.l_int; + fsummary->second.total_value = value.value.l_int; + fsummary->second.last_time = value.ts; + if (value.field_id == RDC_FI_POWER_USAGE) { + gpu_iter->second.energy_last_time = value.ts; + } + return RDC_ST_OK; + } + if (value.field_id == RDC_FI_POWER_USAGE) { + uint64_t time_elapsed = value.ts - gpu_iter->second.energy_last_time; + // Stored in cache as microseconds and microwatts + gpu_iter->second.energy_consumed += + (time_elapsed * value.value.l_int)/(1000.0*1000000); + } + fsummary->second.max_value = std::max(fsummary->second.max_value, + static_cast(value.value.l_int)); + fsummary->second.min_value = std::min(fsummary->second.min_value, + static_cast(value.value.l_int)); + fsummary->second.total_value += value.value.l_int; + fsummary->second.last_time = value.ts; + fsummary->second.count++; + + return RDC_ST_OK; +} + +void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats, + rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary, + unsigned int adjuster) { + if (stats.count == 0) return; + + gpu.max_value = stats.max_value / adjuster; + gpu.min_value = stats.min_value / adjuster; + gpu.average = stats.total_value / stats.count / adjuster; + summary.max_value = std::max(summary.max_value, gpu.max_value); + summary.min_value = std::min(summary.min_value, gpu.min_value); + //< save total for future average calculation. 
+ summary.average += gpu.average; +} + +rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], + const rdc_gpu_total_memory_t& total_memory, + rdc_job_info_t* p_job_info) { + std::lock_guard guard(cache_mutex_); + auto job_stats = cache_jobs_.find(jobId); + + if (job_stats == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + + //< Init the summary info + RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " <summary; + summary_info.start_time = job_stats->second.start_time; + if (job_stats->second.end_time == 0) { + summary_info.end_time = time(nullptr); + } else { + summary_info.end_time = job_stats->second.end_time; + } + summary_info.energy_consumed = 0; + summary_info.max_gpu_memory_used = 0; + summary_info.power_usage = {0, std::numeric_limits::max(), 0}; + summary_info.gpu_clock = {0, std::numeric_limits::max(), 0}; + summary_info.gpu_utilization = {0, std::numeric_limits::max(), 0}; + summary_info.memory_utilization = {0, + std::numeric_limits::max(), 0}; + + p_job_info->num_gpus = job_stats->second.gpu_stats.size(); + + //< Populate information for each GPUs + + auto gpus = job_stats->second.gpu_stats.begin(); + for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { + auto & gpu_info = p_job_info->gpus[gpus->first]; + gpu_info.start_time = summary_info.start_time; + gpu_info.end_time = summary_info.end_time; + gpu_info.energy_consumed = gpus->second.energy_consumed; + summary_info.energy_consumed += gpu_info.energy_consumed; + + auto ite = gpus->second.field_summaries.begin(); + for (; ite != gpus->second.field_summaries.end(); ite++) { + if (ite->first == RDC_FI_POWER_USAGE) { + set_summary(ite->second, + gpu_info.power_usage, summary_info.power_usage, 1000000); + } else if (ite->first == RDC_FI_GPU_MEMORY_USAGE) { + auto tmemory = total_memory.at(gpus->first); + set_summary(ite->second, gpu_info.memory_utilization, + summary_info.memory_utilization, tmemory/100); + gpu_info.max_gpu_memory_used = ite->second.max_value; + 
summary_info.max_gpu_memory_used = std::max( + summary_info.max_gpu_memory_used, + gpu_info.max_gpu_memory_used); + } else if (ite->first == RDC_FI_GPU_SM_CLOCK) { + set_summary(ite->second, gpu_info.gpu_clock, + summary_info.gpu_clock, 1000000); + } else if (ite->first == RDC_FI_GPU_UTIL) { + set_summary(ite->second, gpu_info.gpu_utilization, + summary_info.gpu_utilization, 1); + } + } + } + // Get the average of the summary + summary_info.power_usage.average = summary_info.power_usage.average/ + p_job_info->num_gpus; + summary_info.gpu_clock.average = summary_info.gpu_clock.average/ + p_job_info->num_gpus; + summary_info.gpu_utilization.average = summary_info.gpu_utilization.average/ + p_job_info->num_gpus; + summary_info.memory_utilization.average = + summary_info.memory_utilization.average/p_job_info->num_gpus; + + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], + const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo) { + RdcJobStatsCacheEntry cacheEntry; + cacheEntry.start_time = std::time(nullptr); + cacheEntry.end_time = 0; + for (unsigned int i=0 ; i < ginfo.count; i++) { // GPUs + GpuSummaryStats gstats; + gstats.energy_consumed = 0; + gstats.energy_last_time = 0; + for (unsigned int j = 0; j < finfo.count; j++) { // init fields + FieldSummaryStats s; + s.count = 0; + s.max_value = s.min_value = s.total_value = 0; + gstats.field_summaries.insert({finfo.field_ids[j], s}); + } + + cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats}); + } + + std::lock_guard guard(cache_mutex_); + cache_jobs_.insert({job_id, cacheEntry}); + return RDC_ST_OK; +} + + +rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) { + std::lock_guard guard(cache_mutex_); + auto job_stats = cache_jobs_.find(job_id); + + if (job_stats == cache_jobs_.end()) { + return RDC_ST_NOT_FOUND; + } + + job_stats->second.end_time = std::time(nullptr); + + return RDC_ST_OK; +} + } // namespace rdc } // namespace amd diff 
--git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 8540fccfc7..94313d3610 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -27,6 +27,7 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcCacheManagerImpl.h" #include "rdc_lib/impl/RdcWatchTableImpl.h" #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcException.h" #include "rocm_smi/rocm_smi.h" @@ -73,6 +74,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode): , metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) { if (mode == RDC_OPERATION_MODE_AUTO) { + RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO"); metrics_updater_->start(); } } @@ -83,32 +85,49 @@ RdcEmbeddedHandler::~RdcEmbeddedHandler() { // JOB API rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) { - // TODO(bill_liu): implement - (void)(groupId); - (void)(job_id); - (void)(update_freq); - (void)(max_keep_age); - (void)(max_keep_samples); - - return RDC_ST_OK; + char job_id[64], uint64_t update_freq) { + return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq); } -rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], +rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], rdc_job_info_t* p_job_info) { - // TODO(bill_liu): implement - (void)(job_id); - (void)(p_job_info); - return RDC_ST_OK; + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + uint32_t count = 0; + rdc_status_t status = rdc_device_get_all( + gpu_index_list, &count); + if (status != RDC_ST_OK) { + return status; + } + + rdc_gpu_total_memory_t all_total_memory; + + for (uint32_t i = 0; i < count ; i++) { + rdc_field_value total_memory; + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], + 
RDC_FI_GPU_MEMORY_TOTAL, &total_memory); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU " + << gpu_index_list[i]); + return status; + } + all_total_memory.insert({gpu_index_list[i], total_memory.value.l_int}); + } + + return cache_mgr_->rdc_job_get_stats(job_id, all_total_memory, p_job_info); } -rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64] ) { - // TODO(bill_liu): implement - (void)(job_id); - return RDC_ST_OK; +rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64]) { + return watch_table_->rdc_job_stop_stats(job_id); } +rdc_status_t RdcEmbeddedHandler::rdc_job_remove(char job_id[64]) { + return watch_table_->rdc_job_remove(job_id); +} + + +rdc_status_t RdcEmbeddedHandler::rdc_job_remove_all() { + return watch_table_->rdc_job_remove_all(); +} // Discovery API rdc_status_t RdcEmbeddedHandler::rdc_device_get_all( @@ -194,6 +213,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, } if (!is_gpu_exist) { + RDC_LOG(RDC_INFO, "Fail to add GPU index " << gpu_index << " to group " + << group_id <<" as the GPU index is invalid."); return RDC_ST_NOT_FOUND; } @@ -211,6 +232,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_field_create(uint32_t num_field_ids, if (num_field_ids <= RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) { for (uint32_t i = 0; i < num_field_ids; i++) { if (!metric_fetcher_->is_field_valid(field_ids[i])) { + RDC_LOG(RDC_INFO, + "Fail to create field group with unknown field id " + << field_ids[i]); return RDC_ST_NOT_SUPPORTED; } } @@ -285,6 +309,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_get_latest_value( return RDC_ST_BAD_PARAMETER; } if (!metric_fetcher_->is_field_valid(field)) { + RDC_LOG(RDC_INFO, + "Fail to get latest value with unknown field id " + << field); return RDC_ST_NOT_SUPPORTED; } return cache_mgr_->rdc_field_get_latest_value(gpu_index, field, value); @@ -297,6 +324,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_get_value_since(uint32_t gpu_index, 
return RDC_ST_BAD_PARAMETER; } if (!metric_fetcher_->is_field_valid(field)) { + RDC_LOG(RDC_INFO, + "Fail to get value since with unknown field id " + << field); return RDC_ST_NOT_SUPPORTED; } return cache_mgr_->rdc_field_get_value_since(gpu_index, field, diff --git a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc index 21d57f4136..eb89ddb505 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -22,11 +22,20 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcGroupSettingsImpl.h" #include #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" namespace amd { namespace rdc { RdcGroupSettingsImpl::RdcGroupSettingsImpl() { + // Add the default job stats fields + uint32_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, + RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL}; + char job_field_group[] = "JobStatsFields"; + rdc_field_grp_t fgid = JOB_FIELD_ID; + + rdc_group_field_create(sizeof(job_fields)/sizeof(uint32_t), + job_fields, job_field_group, &fgid); } rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create( @@ -62,6 +71,8 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add( // Check whether the index already exists for (uint32_t i=0; i < ite->second.count; i++) { if (ite->second.entity_ids[i] == gpu_index) { + RDC_LOG(RDC_INFO, "Fail to add " << gpu_index + <<" to GPU group " << groupId << " as it is already exists"); return RDC_ST_BAD_PARAMETER; } } @@ -136,15 +147,19 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create( if (field_group_.size() >= RDC_MAX_NUM_FIELD_GROUPS) { return RDC_ST_MAX_LIMIT; } - field_group_.emplace(cur_filed_group_id_, finfo); - *rdc_field_group_id = cur_filed_group_id_; - cur_filed_group_id_++; + field_group_.emplace(cur_field_group_id_, finfo); + *rdc_field_group_id = cur_field_group_id_; + cur_field_group_id_++; return RDC_ST_OK; } rdc_status_t 
RdcGroupSettingsImpl::rdc_group_field_destroy( rdc_field_grp_t rdc_field_group_id) { + if (rdc_field_group_id == JOB_FIELD_ID) { + RDC_LOG(RDC_INFO, "Cannot delete system JOB_FIELD_ID field group"); + return RDC_ST_BAD_PARAMETER; + } std::lock_guard guard(field_group_mutex_); field_group_.erase(rdc_field_group_id); return RDC_ST_OK; @@ -183,6 +198,10 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_all_ids( if (*count >= RDC_MAX_NUM_FIELD_GROUPS) { return RDC_ST_MAX_LIMIT; } + + // Skip system defined JOB_FIELD_ID + if (ite->first == JOB_FIELD_ID) continue; + field_group_id_list[*count] = ite->first; (*count)++; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 35ad6ed32f..114510b8e5 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -22,10 +22,11 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcMetricFetcherImpl.h" #include #include -#include +#include //NOLINT #include #include #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" #include "rocm_smi/rocm_smi.h" namespace amd { @@ -48,6 +49,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, uint64_t i64 = 0; if (!is_field_valid(field_id)) { + RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id + << " which is not supported"); return RDC_ST_NOT_SUPPORTED; } @@ -125,6 +128,27 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, break; } + gettimeofday(&tv, NULL); + int64_t latency = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000 + - value->ts; + if (value->status != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << + field_id_string(field_id) << " with rsmi error code " + << value->status <<", latency " << latency); + } else if (value->type == INTEGER) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << ":" << value->value.l_int + << 
", latency " << latency); + } else if (value->type == DOUBLE) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << ":" << value->value.dbl + << ", latency " << latency); + } else if (value->type == STRING) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << ":" << value->value.str + << ", latency " << latency); + } + return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 979e5eb536..47bc11fb4c 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -23,8 +23,10 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcWatchTableImpl.h" #include #include +#include #include #include "rdc_lib/rdc_common.h" +#include "rdc_lib/RdcLogger.h" namespace amd { namespace rdc { @@ -39,24 +41,92 @@ RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, } rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64]) { - // TODO(bill_liu): implement - (void)(group_id); - (void)(job_id); - return RDC_ST_OK; + char job_id[64], uint64_t update_freq) { + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + if (job_watch_table_.find(job_id) != job_watch_table_.end()) { + return RDC_ST_ALREADY_EXIST; + } + } while (0); + + std::vector fields_in_watch; + rdc_status_t result = get_fields_from_group(group_id, + JOB_FIELD_ID, fields_in_watch); + JobWatchTableEntry jentry {group_id, fields_in_watch}; + job_watch_table_.insert({job_id, jentry}); + + result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); + if (result != RDC_ST_OK) { + return result; + } + + rdc_field_group_info_t finfo; + rdc_group_info_t ginfo; + result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + if (result != RDC_ST_OK) { + return result; + } + 
result = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &finfo); + if (result != RDC_ST_OK) { + return result; + } + + result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo); + + return result; } -rdc_status_t RdcWatchTableImpl::rdc_watch_job_fields(rdc_gpu_group_t group_id, - uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) { - // TODO(bill_liu): implement - (void)(group_id); - (void)(update_freq); - (void)(max_keep_age); - (void)(max_keep_samples); - return RDC_ST_OK; +rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) { + uint32_t job_group_id; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + auto job = job_watch_table_.find(job_id); + if (job == job_watch_table_.end()) { + return RDC_ST_NOT_FOUND; + } + job_group_id = job->second.group_id; + } while (0); + + rdc_status_t result = rdc_field_unwatch(job_group_id, JOB_FIELD_ID); + if (result != RDC_ST_OK) { + return result; + } + + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + job_watch_table_.erase(job_id); + } while (0); + + result = cache_mgr_->rdc_job_stop_stats(job_id); + + return result; } +rdc_status_t RdcWatchTableImpl::rdc_job_remove(char job_id[64]) { + rdc_job_stop_stats(job_id); + return cache_mgr_->rdc_job_remove(job_id); +} + +rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() { + // Get all the job ids; + std::vector v; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + for (auto ite = job_watch_table_.begin(); + ite != job_watch_table_.end(); ite++) { + v.push_back(ite->first); + } + } while (0); + + // Stop them + for (auto job = v.begin(); job != v.end(); job++) { + rdc_job_stop_stats(const_cast(job->c_str())); + } + + return cache_mgr_->rdc_job_remove_all(); +} + + rdc_status_t RdcWatchTableImpl::get_fields_from_group(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id, std::vector & fields) { rdc_field_group_info_t finfo; @@ -228,6 +298,21 @@ 
rdc_status_t RdcWatchTableImpl::rdc_field_unwatch( return update_field_in_table_when_unwatch(ite->first); } +bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, + uint32_t field_id, std::string& job_id) const { + RdcFieldKey key{gpu_index, field_id}; + + for (auto ite = job_watch_table_.begin(); + ite != job_watch_table_.end(); ite++) { + auto& fields = ite->second.fields; + if (std::find(fields.begin(), fields.end(), key) != fields.end()) { + job_id = ite->first; + return true; + } + } + + return false; +} rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { uint32_t items_fetched = 0; @@ -251,13 +336,19 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { result = metric_fetcher_->fetch_smi_field( fite->first.first, fite->first.second, &value); if (result != RDC_ST_OK) { - LOG_DEBUG("Fail to fetch the field: " << rdc_status_string(result)); continue; } // Update the cache cache_mgr_->rdc_update_cache(fite->first.first, value); + // Update the job stats cache + std::string job_id; + if (is_job_watch_field(fite->first.first, fite->first.second, job_id)) { + cache_mgr_->rdc_update_job_stats(fite->first.first, job_id, value); + } + + // Update the last_upate_time gettimeofday(&tv, NULL); now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; @@ -303,6 +394,56 @@ void RdcWatchTableImpl::clean_up() { ++wite; } } + + // Debug log every 30 seconds + if (now/1000%30 == 0) { + debug_status(); + } +} + +void RdcWatchTableImpl::debug_status() { + RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size() + << " watch_table_:" << watch_table_.size() + << " job_watch_table_:" << job_watch_table_.size() + << " cache stats:" << cache_mgr_->get_cache_stats()); + + if (watch_table_.size() > 0) { + RDC_LOG(RDC_DEBUG, "watch table details:"); + } + for (auto wite = watch_table_.begin(); wite != watch_table_.end(); wite++) { + RDC_LOG(RDC_DEBUG, wite->first.first << "," << wite->first.second + << ": age:" << wite->second.max_keep_age << ", samples:" + << 
wite->second.max_keep_samples << ", is_watching:" + << wite->second.is_watching << ", last_update_time:" + << wite->second.last_update_time <<", update_freq:" + << wite->second.update_freq); + } + + if (job_watch_table_.size() > 0) { + RDC_LOG(RDC_DEBUG, "job watch table details: "); + } + for (auto jite = job_watch_table_.begin(); + jite !=job_watch_table_.end(); jite++) { + std::stringstream strstream; + for (const auto& p : jite->second.fields) { + strstream << "<" << p.first << "," << p.second << "> "; + } + RDC_LOG(RDC_DEBUG, jite->first << ": " << jite->second.group_id + << " fields : "<< strstream.str()); + } + + if (fields_to_watch_.size() > 0) { + RDC_LOG(RDC_DEBUG, "fields to watch details:"); + } + for (auto fite = fields_to_watch_.begin(); fite != fields_to_watch_.end(); + fite++) { + RDC_LOG(RDC_DEBUG, fite->first.first << "," << fite->first.second + << ": age:" << fite->second.max_keep_age << ", samples:" + << fite->second.max_keep_samples << ", is_watching:" + << fite->second.is_watching << ", last_update_time:" + << fite->second.last_update_time <<", update_freq:" + << fite->second.update_freq); + } } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 810cfb8c99..3c7088e3a8 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -62,19 +62,16 @@ rdc_status_t RdcStandaloneHandler::error_handle(::grpc::Status status, // JOB RdcAPI rdc_status_t RdcStandaloneHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, - char job_id[64], uint64_t update_freq, double max_keep_age, - uint32_t max_keep_samples) { + char job_id[64], uint64_t update_freq) { // TODO(bill_liu): implement (void)(groupId); (void)(job_id); (void)(update_freq); - (void)(max_keep_age); - (void)(max_keep_samples); return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char 
job_id[64], +rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64], rdc_job_info_t* p_job_info) { // TODO(bill_liu): implement (void)(job_id); @@ -82,13 +79,25 @@ rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64], return RDC_ST_OK; } -rdc_status_t RdcStandaloneHandler::rdc_job_stop_stats(char job_id[64] ) { +rdc_status_t RdcStandaloneHandler::rdc_job_stop_stats(char job_id[64]) { // TODO(bill_liu): implement (void)(job_id); return RDC_ST_OK; } +rdc_status_t RdcStandaloneHandler::rdc_job_remove(char job_id[64]) { + // TODO(bill_liu): implement + (void)(job_id); + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_job_remove_all() { + // TODO(bill_liu): implement + return RDC_ST_OK; +} + + // Discovery RdcAPI rdc_status_t RdcStandaloneHandler::rdc_device_get_all( uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { diff --git a/projects/rdc/rdci/src/RdciDmonSubSystem.cc b/projects/rdc/rdci/src/RdciDmonSubSystem.cc index f073ba65c2..c1618d785d 100644 --- a/projects/rdc/rdci/src/RdciDmonSubSystem.cc +++ b/projects/rdc/rdci/src/RdciDmonSubSystem.cc @@ -58,6 +58,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { const struct option long_options[] = { {"host", required_argument, nullptr, HOST_OPTIONS }, {"help", optional_argument, nullptr, 'h' }, + {"unauth", optional_argument, nullptr, 'u' }, {"list", optional_argument, nullptr, 'l' }, {"field-group-id", required_argument, nullptr, 'f' }, {"field-id", required_argument, nullptr, 'e' }, @@ -73,7 +74,7 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { std::string gpu_indexes; std::string field_ids; - while ((opt = getopt_long(argc, argv, "hlf:g:c:d:e:i:", + while ((opt = getopt_long(argc, argv, "hluf:g:c:d:e:i:", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: @@ -82,9 +83,12 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { case 'h': dmon_ops_ = DMON_HELP; return; + case 'u': + 
use_auth_ = false; + break; case 'l': dmon_ops_ = DMON_LIST_FIELDS; - return; + break; case 'f': if (!IsNumber(optarg)) { show_help(); @@ -130,6 +134,10 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { } } + if (dmon_ops_ == DMON_LIST_FIELDS) { + return; + } + if (options_.find(OPTIONS_FIELD_GROUP_ID) == options_.end()) { if (field_ids == "") { show_help(); diff --git a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc index 4d9b07d6c7..4961cd6698 100644 --- a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc +++ b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc @@ -40,6 +40,7 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { const struct option long_options[] = { {"host", required_argument, nullptr, HOST_OPTIONS }, {"help", optional_argument, nullptr, 'h' }, + {"unauth", optional_argument, nullptr, 'u' }, {"list", optional_argument, nullptr, 'l' }, {"group", required_argument, nullptr, 'g'}, {"create", required_argument, nullptr, 'c' }, @@ -52,7 +53,7 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { int option_index = 0; int opt = 0; - while ((opt = getopt_long(argc, argv, "hlif:c:g:d:", + while ((opt = getopt_long(argc, argv, "hluif:c:g:d:", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: @@ -61,6 +62,9 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { case 'h': field_group_ops_ = FIELD_GROUP_HELP; return; + case 'u': + use_auth_ = false; + break; case 'l': field_group_ops_ = FIELD_GROUP_LIST; break; diff --git a/projects/rdc/rdci/src/RdciGroupSubSystem.cc b/projects/rdc/rdci/src/RdciGroupSubSystem.cc index e8f58879cb..c105333217 100644 --- a/projects/rdc/rdci/src/RdciGroupSubSystem.cc +++ b/projects/rdc/rdci/src/RdciGroupSubSystem.cc @@ -40,6 +40,7 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { const struct option long_options[] = { {"host", required_argument, nullptr, 
HOST_OPTIONS }, {"help", optional_argument, nullptr, 'h' }, + {"unauth", optional_argument, nullptr, 'u' }, {"list", optional_argument, nullptr, 'l' }, {"group", required_argument, nullptr, 'g'}, {"create", required_argument, nullptr, 'c' }, @@ -52,7 +53,7 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { int option_index = 0; int opt = 0; - while ((opt = getopt_long(argc, argv, "hlic:g:a:d:", + while ((opt = getopt_long(argc, argv, "hluic:g:a:d:", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: @@ -61,6 +62,9 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { case 'h': group_ops_ = GROUP_HELP; return; + case 'u': + use_auth_ = false; + break; case 'l': group_ops_ = GROUP_LIST; break;