From f4a3fd4dda4ba397f492ca22aac9679115c5e7c2 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 29 Apr 2020 10:32:50 -0400 Subject: [PATCH] Support extra metrics in the RDC Remove the * in the rdci stats When a group is created, the GPUs can be added in the same command. Add the support to the memory temperature. Add the support to the memory clock. Add the support to report the ECC errors. Add the support to report the PCIe bandwidth throughput. Since the RX/TX throughput may take 1 second to retrieve, an async fetch is implemented in the RdcMetricFetcherImpl. Change-Id: If04f602fe1f2d14dbf7c2fb189549fd030523f9a --- include/rdc/rdc.h | 38 ++++ include/rdc_lib/RdcCacheManager.h | 11 +- include/rdc_lib/RdcWatchTable.h | 6 +- include/rdc_lib/impl/RdcCacheManagerImpl.h | 10 +- include/rdc_lib/impl/RdcEmbeddedHandler.h | 1 + include/rdc_lib/impl/RdcMetricFetcherImpl.h | 39 ++++ include/rdc_lib/impl/RdcWatchTableImpl.h | 6 +- include/rdc_lib/rdc_common.h | 8 + protos/rdc.proto | 6 + rdc_libs/bootstrap/src/RdcBootStrap.cc | 6 + rdc_libs/rdc/src/RdcCacheManagerImpl.cc | 108 ++++++++- rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 57 ++++- rdc_libs/rdc/src/RdcGroupSettingsImpl.cc | 4 +- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 207 +++++++++++++++++- rdc_libs/rdc/src/RdcWatchTableImpl.cc | 32 ++- .../rdc_client/src/RdcStandaloneHandler.cc | 22 ++ rdci/src/RdciDmonSubSystem.cc | 13 +- rdci/src/RdciGroupSubSystem.cc | 29 ++- rdci/src/RdciStatsSubSystem.cc | 32 ++- server/src/rdc_api_service.cc | 28 ++- 20 files changed, 601 insertions(+), 62 deletions(-) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index f429342727..c9de84715d 100755 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -144,16 +144,48 @@ typedef enum { */ #define RDC_FI_GPU_SM_CLOCK 100 +/** + * Clock for the memory + */ +#define RDC_FI_MEM_CLOCK 101 + +/** + * PCIe Tx utilization information + */ +#define RDC_FI_PCIE_TX 200 + +/** + * PCIe Rx utilization information + */ +#define RDC_FI_PCIE_RX 
201 + + /** * GPU Utilization */ #define RDC_FI_GPU_UTIL 203 +/** + * Accumulated correctable ECC errors + */ +#define RDC_FI_ECC_CORRECT_TOTAL 312 + +/** + * Accumulated uncorrectable ECC errors + */ +#define RDC_FI_ECC_UNCORRECT_TOTAL 313 + +/** + * Memory temperature for the device + */ +#define RDC_FI_MEMORY_TEMP 140 + /** * Current temperature for the device */ #define RDC_FI_GPU_TEMP 150 + /** * GPU count in the system */ @@ -209,9 +241,15 @@ typedef struct { uint64_t end_time; //!< The time to stop the watching uint64_t energy_consumed; + uint64_t ecc_correct; + uint64_t ecc_uncorrect; + rdc_stats_summary_t pcie_tx; + rdc_stats_summary_t pcie_rx; rdc_stats_summary_t power_usage; rdc_stats_summary_t gpu_clock; + rdc_stats_summary_t memory_clock; rdc_stats_summary_t gpu_utilization; + rdc_stats_summary_t gpu_temperature; uint64_t max_gpu_memory_used; rdc_stats_summary_t memory_utilization; diff --git a/include/rdc_lib/RdcCacheManager.h b/include/rdc_lib/RdcCacheManager.h index 77eab78123..0883bf8f7f 100644 --- a/include/rdc_lib/RdcCacheManager.h +++ b/include/rdc_lib/RdcCacheManager.h @@ -32,7 +32,6 @@ THE SOFTWARE. 
namespace amd { namespace rdc { -typedef std::map rdc_gpu_total_memory_t; class RdcCacheManager { public: @@ -48,12 +47,14 @@ class RdcCacheManager { virtual std::string get_cache_stats() = 0; virtual rdc_status_t rdc_job_get_stats(char jobId[64], - const rdc_gpu_total_memory_t& total_memory, + const rdc_gpu_gauges_t& gpu_gauges, rdc_job_info_t* p_job_info) = 0; virtual rdc_status_t rdc_job_start_stats(char jobId[64], const rdc_group_info_t& group, - const rdc_field_group_info_t& finfo) = 0; - virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) = 0; virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id, const rdc_field_value& value) = 0; virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; @@ -64,8 +65,6 @@ class RdcCacheManager { typedef std::shared_ptr RdcCacheManagerPtr; -// -typedef std::pair RdcFieldKey; } // namespace rdc } // namespace amd diff --git a/include/rdc_lib/RdcWatchTable.h b/include/rdc_lib/RdcWatchTable.h index 34f01750b0..4adb2c39b7 100644 --- a/include/rdc_lib/RdcWatchTable.h +++ b/include/rdc_lib/RdcWatchTable.h @@ -36,8 +36,10 @@ class RdcWatchTable { virtual rdc_status_t rdc_field_update_all() = 0; virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64], uint64_t update_freq) = 0; - virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + char job_id[64], uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauge) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) = 0; virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; virtual rdc_status_t rdc_job_remove_all() = 0; diff --git a/include/rdc_lib/impl/RdcCacheManagerImpl.h b/include/rdc_lib/impl/RdcCacheManagerImpl.h index 952f376657..f8bf9d9ebd 100644 --- 
a/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -53,6 +53,8 @@ struct FieldSummaryStats { struct GpuSummaryStats { uint64_t energy_consumed; uint64_t energy_last_time; + uint64_t ecc_correct_init; // Init counter when job starts + uint64_t ecc_uncorrect_init; // Init counter when job starts std::map field_summaries; }; @@ -80,12 +82,14 @@ class RdcCacheManagerImpl: public RdcCacheManager { std::string get_cache_stats() override; rdc_status_t rdc_job_get_stats(char job_id[64], - const rdc_gpu_total_memory_t& total_memory, + const rdc_gpu_gauges_t& gpu_gauges, rdc_job_info_t* p_job_info) override; rdc_status_t rdc_job_start_stats(char job_id[64], const rdc_group_info_t& group, - const rdc_field_group_info_t& finfo) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) override; + rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) override; rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id, const rdc_field_value& value) override; diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index 57d4f73d96..cb440f1c16 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -92,6 +92,7 @@ class RdcEmbeddedHandler: public RdcHandler { ~RdcEmbeddedHandler(); private: + rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges); RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; RdcMetricFetcherPtr metric_fetcher_; diff --git a/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/include/rdc_lib/impl/RdcMetricFetcherImpl.h index cac7ffc0e4..886c58e84e 100644 --- a/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -22,16 +22,55 @@ THE SOFTWARE. 
#ifndef RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_ #define RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_ +#include // NOLINT(build/c++11) +#include // NOLINT(build/c++11) +#include // NOLINT(build/c++11) +#include +#include #include "rdc_lib/RdcMetricFetcher.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { +//!< Some metrics, like PCIe throughput, may take a second to retrieve. The +//!< MetricValue will cache those metrics for async retrieval. +struct MetricValue { + uint64_t cache_ttl; + uint64_t last_time; + rdc_field_value value; +}; + + +//!< The data structure to store the async fetch task +class RdcMetricFetcherImpl; +struct MetricTask { + RdcFieldKey field; + std::function task; +}; + class RdcMetricFetcherImpl: public RdcMetricFetcher { public: rdc_status_t fetch_smi_field(uint32_t gpu_index, uint32_t field_id, rdc_field_value* value) override; bool is_field_valid(uint32_t field_id) const override; + RdcMetricFetcherImpl(); + ~RdcMetricFetcherImpl(); + private: + uint64_t now(); + void get_ecc_error(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value); + void async_get_pcie_throughput(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value); + void get_pcie_throughput(const RdcFieldKey& key); + + //!< Async metric retrieval + std::map async_metrics_; + std::queue updated_tasks_; + std::mutex task_mutex_; + std::future updater_; // keep the future of updater + std::condition_variable cv_; + std::atomic task_started_; }; } // namespace rdc diff --git a/include/rdc_lib/impl/RdcWatchTableImpl.h b/include/rdc_lib/impl/RdcWatchTableImpl.h index 266fd91911..15977f2a59 100644 --- a/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -54,8 +54,10 @@ struct JobWatchTableEntry { class RdcWatchTableImpl : public RdcWatchTable { public: rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64], uint64_t update_freq) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + 
char job_id[64], uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauge) override; + rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) override; rdc_status_t rdc_job_remove(char job_id[64]) override; rdc_status_t rdc_job_remove_all() override; diff --git a/include/rdc_lib/rdc_common.h b/include/rdc_lib/rdc_common.h index d08ddae6fa..46a1a01e1c 100644 --- a/include/rdc_lib/rdc_common.h +++ b/include/rdc_lib/rdc_common.h @@ -23,6 +23,8 @@ THE SOFTWARE. #ifndef RDC_LIB_RDC_COMMON_H_ #define RDC_LIB_RDC_COMMON_H_ #include +#include +#include #define RDC_ERROR 0 #define RDC_INFO 1 @@ -37,6 +39,12 @@ THE SOFTWARE. } \ } while (0) +// +typedef std::pair RdcFieldKey; + +//!< The gauge metrics do not require aggregations +typedef std::map rdc_gpu_gauges_t; + /** * @brief The strncpy but with null terminated * diff --git a/protos/rdc.proto b/protos/rdc.proto index 00ab6f4cc2..6e9a1b9627 100755 --- a/protos/rdc.proto +++ b/protos/rdc.proto @@ -427,6 +427,12 @@ message GpuUsageInfo { JobStatsSummary gpu_utilization = 7; uint64 max_gpu_memory_used = 8; JobStatsSummary memory_utilization = 9; + uint64 ecc_correct = 10; + uint64 ecc_uncorrect = 11; + JobStatsSummary pcie_tx = 12; + JobStatsSummary pcie_rx = 13; + JobStatsSummary memory_clock = 14; + JobStatsSummary gpu_temperature = 15; } message GetJobStatsResponse { uint32 status = 1; diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index 0326bd0f26..392b3782de 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -358,6 +358,12 @@ const char* field_id_string(uint32_t field_id) { {RDC_FI_GPU_UTIL, "GPU_UTIL"}, {RDC_FI_GPU_TEMP, "GPU_TEMP"}, {RDC_FI_GPU_COUNT, "GPU_COUNT"}, + {RDC_FI_MEM_CLOCK, "MEM_CLOCK"}, + {RDC_FI_PCIE_TX, "PCIE_TX"}, + {RDC_FI_PCIE_RX, "PCIE_RX"}, + {RDC_FI_ECC_CORRECT_TOTAL, "ECC_CORRECT"}, + {RDC_FI_ECC_UNCORRECT_TOTAL, "ECC_UNCORRECT"}, + {RDC_FI_MEMORY_TEMP, "MEMORY_TEMP"}, 
{RDC_FI_DEV_NAME, "DEV_NAME"} }; diff --git a/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index b8d503c9c1..e86e0f435d 100644 --- a/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -235,7 +235,11 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index, void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats, rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary, unsigned int adjuster) { - if (stats.count == 0) return; + if (stats.count == 0) { + gpu.min_value = std::numeric_limits::max(); + gpu.max_value = gpu.average = 0; + return; + } gpu.max_value = stats.max_value / adjuster; gpu.min_value = stats.min_value / adjuster; @@ -247,7 +251,7 @@ void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats, } rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], - const rdc_gpu_total_memory_t& total_memory, + const rdc_gpu_gauges_t& gpu_gauges, rdc_job_info_t* p_job_info) { std::lock_guard guard(cache_mutex_); auto job_stats = cache_jobs_.find(jobId); @@ -257,6 +261,7 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], } //< Init the summary info + bool is_job_stopped = (job_stats->second.end_time != 0); RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " << jobId); auto& summary_info = p_job_info->summary; summary_info.start_time = job_stats->second.start_time; @@ -267,7 +272,13 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], } summary_info.energy_consumed = 0; summary_info.max_gpu_memory_used = 0; + summary_info.ecc_correct = 0; + summary_info.ecc_uncorrect = 0; summary_info.power_usage = {0, std::numeric_limits::max(), 0}; + summary_info.pcie_tx = {0, std::numeric_limits::max(), 0}; + summary_info.pcie_rx = {0, std::numeric_limits::max(), 0}; + summary_info.gpu_temperature = {0, std::numeric_limits::max(), 0}; + summary_info.memory_clock = {0, std::numeric_limits::max(), 0}; 
summary_info.gpu_clock = {0, std::numeric_limits::max(), 0}; summary_info.gpu_utilization = {0, std::numeric_limits::max(), 0}; summary_info.memory_utilization = {0, @@ -285,13 +296,46 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], gpu_info.energy_consumed = gpus->second.energy_consumed; summary_info.energy_consumed += gpu_info.energy_consumed; + if (is_job_stopped) { + gpu_info.ecc_correct = gpus->second.ecc_correct_init; + summary_info.ecc_correct += gpu_info.ecc_correct; + } else if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { + gpu_info.ecc_correct = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - + gpus->second.ecc_correct_init; + summary_info.ecc_correct += gpu_info.ecc_correct; + } else { + gpu_info.ecc_correct = 0; + } + + if (is_job_stopped) { + gpu_info.ecc_uncorrect = gpus->second.ecc_uncorrect_init; + summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; + } else if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { + gpu_info.ecc_uncorrect = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - + gpus->second.ecc_uncorrect_init; + summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; + } else { + gpu_info.ecc_uncorrect = 0; + } + + if (gpu_gauges.find({gpus->first, + RDC_FI_GPU_MEMORY_TOTAL}) == gpu_gauges.end()) { + RDC_LOG(RDC_ERROR, "Cannot find the total memory"); + return RDC_ST_BAD_PARAMETER; + } + uint64_t tmemory = gpu_gauges.at({gpus->first, + RDC_FI_GPU_MEMORY_TOTAL}); + auto ite = gpus->second.field_summaries.begin(); for (; ite != gpus->second.field_summaries.end(); ite++) { if (ite->first == RDC_FI_POWER_USAGE) { set_summary(ite->second, gpu_info.power_usage, summary_info.power_usage, 1000000); } else if (ite->first == RDC_FI_GPU_MEMORY_USAGE) { - auto tmemory = total_memory.at(gpus->first); set_summary(ite->second, gpu_info.memory_utilization, summary_info.memory_utilization, tmemory/100); gpu_info.max_gpu_memory_used = 
ite->second.max_value; @@ -304,6 +348,18 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], } else if (ite->first == RDC_FI_GPU_UTIL) { set_summary(ite->second, gpu_info.gpu_utilization, summary_info.gpu_utilization, 1); + } else if (ite->first == RDC_FI_GPU_TEMP) { + set_summary(ite->second, + gpu_info.gpu_temperature, summary_info.gpu_temperature, 1000); + } else if (ite->first == RDC_FI_MEM_CLOCK) { + set_summary(ite->second, + gpu_info.memory_clock, summary_info.memory_clock, 1000000); + } else if (ite->first == RDC_FI_PCIE_TX) { + set_summary(ite->second, + gpu_info.pcie_tx, summary_info.pcie_tx, 1024*1024); + } else if (ite->first == RDC_FI_PCIE_RX) { + set_summary(ite->second, + gpu_info.pcie_rx, summary_info.pcie_rx, 1024*1024); } } } @@ -316,12 +372,21 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], p_job_info->num_gpus; summary_info.memory_utilization.average = summary_info.memory_utilization.average/p_job_info->num_gpus; + summary_info.pcie_tx.average = summary_info.pcie_tx.average/ + p_job_info->num_gpus; + summary_info.pcie_rx.average = summary_info.pcie_rx.average/ + p_job_info->num_gpus; + summary_info.gpu_temperature.average = summary_info.gpu_temperature.average/ + p_job_info->num_gpus; + summary_info.memory_clock.average = summary_info.memory_clock.average/ + p_job_info->num_gpus; return RDC_ST_OK; } rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], - const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo) { + const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) { RdcJobStatsCacheEntry cacheEntry; cacheEntry.start_time = std::time(nullptr); cacheEntry.end_time = 0; @@ -336,6 +401,20 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], gstats.field_summaries.insert({finfo.field_ids[j], s}); } + gstats.ecc_correct_init = 0; + if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}) != + 
gpu_gauges.end()) { + gstats.ecc_correct_init = gpu_gauges.at( + {ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}); + } + + gstats.ecc_uncorrect_init = 0; + if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}) != + gpu_gauges.end()) { + gstats.ecc_uncorrect_init = gpu_gauges.at( + {ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}); + } + cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats}); } @@ -347,7 +426,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], } -rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) { +rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauges) { std::lock_guard guard(cache_mutex_); auto job_stats = cache_jobs_.find(job_id); @@ -357,6 +437,24 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) { job_stats->second.end_time = std::time(nullptr); + // update the ecc errors + auto gpus = job_stats->second.gpu_stats.begin(); + for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { + if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { + gpus->second.ecc_correct_init = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - + gpus->second.ecc_correct_init; + } + + if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { + gpus->second.ecc_uncorrect_init = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - + gpus->second.ecc_uncorrect_init; + } + } + return RDC_ST_OK; } diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index fa5e3d3dc1..bd02f1b7d6 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -86,38 +86,75 @@ RdcEmbeddedHandler::~RdcEmbeddedHandler() { // JOB API rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq) { - return watch_table_->rdc_job_start_stats(groupId, 
job_id, update_freq); + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; + + return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq, + gpu_gauges); } -rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], - rdc_job_info_t* p_job_info) { +rdc_status_t RdcEmbeddedHandler::get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges) { uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; uint32_t count = 0; + + if (gpu_gauges == nullptr) { + return RDC_ST_BAD_PARAMETER; + } rdc_status_t status = rdc_device_get_all( gpu_index_list, &count); if (status != RDC_ST_OK) { return status; } - rdc_gpu_total_memory_t all_total_memory; - + // Fetch total memory and current ecc errors for (uint32_t i = 0; i < count ; i++) { - rdc_field_value total_memory; + rdc_field_value value; status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], - RDC_FI_GPU_MEMORY_TOTAL, &total_memory); + RDC_FI_GPU_MEMORY_TOTAL, &value); if (status != RDC_ST_OK) { RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU " << gpu_index_list[i]); return status; } - all_total_memory.insert({gpu_index_list[i], total_memory.value.l_int}); + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_GPU_MEMORY_TOTAL}, + value.value.l_int}); + + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], + RDC_FI_ECC_CORRECT_TOTAL, &value); + if (status == RDC_ST_OK) { + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_CORRECT_TOTAL}, + value.value.l_int}); + } + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], + RDC_FI_ECC_UNCORRECT_TOTAL, &value); + if (status == RDC_ST_OK) { + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_UNCORRECT_TOTAL}, + value.value.l_int}); + } + } + return RDC_ST_OK; +} + +rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], + rdc_job_info_t* p_job_info) { + if (p_job_info == nullptr) { + return RDC_ST_BAD_PARAMETER; } - return cache_mgr_->rdc_job_get_stats(job_id, all_total_memory, 
p_job_info); + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; + + return cache_mgr_->rdc_job_get_stats(job_id, gpu_gauges, p_job_info); } rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64]) { - return watch_table_->rdc_job_stop_stats(job_id); + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; + + return watch_table_->rdc_job_stop_stats(job_id, gpu_gauges); } rdc_status_t RdcEmbeddedHandler::rdc_job_remove(char job_id[64]) { diff --git a/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc index eb89ddb505..9ba40b7130 100644 --- a/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc +++ b/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -30,7 +30,9 @@ namespace rdc { RdcGroupSettingsImpl::RdcGroupSettingsImpl() { // Add the default job stats fields uint32_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, - RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL}; + RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, + RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, RDC_FI_MEM_CLOCK, + RDC_FI_GPU_TEMP}; char job_field_group[] = "JobStatsFields"; rdc_field_grp_t fgid = JOB_FIELD_ID; diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 114510b8e5..34200ced8a 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -35,18 +35,192 @@ namespace rdc { bool RdcMetricFetcherImpl::is_field_valid(uint32_t field_id) const { const std::vector all_fields = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_GPU_MEMORY_TOTAL, RDC_FI_GPU_COUNT, RDC_FI_POWER_USAGE, - RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME, RDC_FI_GPU_TEMP}; + RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME, RDC_FI_GPU_TEMP, + RDC_FI_MEM_CLOCK, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, + RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, RDC_FI_MEMORY_TEMP}; 
return std::find(all_fields.begin(), all_fields.end(), field_id) != all_fields.end(); } +RdcMetricFetcherImpl::RdcMetricFetcherImpl() { + task_started_ = true; + + // kick off another thread for async fetch + updater_ = std::async(std::launch::async, [this]() { + while (task_started_) { + std::unique_lock lk(task_mutex_); + // Wait for tasks or stop signal + cv_.wait(lk, [this]{ + return !updated_tasks_.empty() || !task_started_; + }); + if (updated_tasks_.empty()) continue; + + // Get the tasks + auto item = updated_tasks_.front(); + updated_tasks_.pop(); + // The task may take long time, release lock + lk.unlock(); + + // run task + item.task(*this, item.field); + } // end while (task_started_) + }); +} + +RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { + // Notify the async task to stop + task_started_ = false; + cv_.notify_all(); +} + +uint64_t RdcMetricFetcherImpl::now() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; +} + +void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value) { + rsmi_status_t err = RSMI_STATUS_SUCCESS; + uint64_t correctable_err = 0; + uint64_t uncorrectable_err = 0; + rsmi_ras_err_state_t err_state; + + if (!value) { + return; + } + for (uint32_t b = RSMI_GPU_BLOCK_FIRST; + b <= RSMI_GPU_BLOCK_LAST; b = b*2) { + err = rsmi_dev_ecc_status_get(gpu_index, static_cast(b), + &err_state); + if (err != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "Get the ecc Status error " << b + << ":" << err); + continue; + } + + rsmi_error_count_t ec; + err = rsmi_dev_ecc_count_get(gpu_index, + static_cast(b), &ec); + + if (err == RSMI_STATUS_SUCCESS) { + correctable_err += ec.correctable_err; + uncorrectable_err += ec.uncorrectable_err; + } + } + + value->status = RSMI_STATUS_SUCCESS; + value->type = INTEGER; + if (field_id == RDC_FI_ECC_CORRECT_TOTAL) { + value->value.l_int = correctable_err; + } + if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) { + 
value->value.l_int = uncorrectable_err; + } +} + +void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value) { + if (!value) { + return; + } + + do { + std::lock_guard guard(task_mutex_); + auto metric = async_metrics_.find({gpu_index, field_id}); + if ( metric != async_metrics_.end() ) { + if (now() < metric->second.last_time + metric->second.cache_ttl) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << " from cache"); + value->status = metric->second.value.status; + value->type = metric->second.value.type; + value->value = metric->second.value.value; + return; + } + } + + // add to the async task queue + MetricTask t; + t.field = {gpu_index, field_id}; + t.task = &RdcMetricFetcherImpl::get_pcie_throughput; + updated_tasks_.push(t); + + RDC_LOG(RDC_DEBUG, "Start async fetch " << gpu_index << ":" << + field_id_string(field_id) << " to cache."); + } while (0); + cv_.notify_all(); +} + +void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { + uint32_t gpu_index = key.first; + uint64_t sent, received, max_pkt_sz; + rsmi_status_t ret; + + // Return if the cache does not expire yet + do { + std::lock_guard guard(task_mutex_); + auto metric = async_metrics_.find(key); + if (metric != async_metrics_.end() && + now() < metric->second.last_time + metric->second.cache_ttl) { + return; + } + } while (0); + + ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz); + + uint64_t curTime = now(); + MetricValue value; + value.cache_ttl = 30*1000; // cache 30 seconds + value.value.type = INTEGER; + do { + std::lock_guard guard(task_mutex_); + // Create new cache entry it does not exist + auto tx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_TX}); + if (tx_metric == async_metrics_.end()) { + tx_metric = async_metrics_.insert( + {{gpu_index, RDC_FI_PCIE_TX}, value}).first; + tx_metric->second.value.field_id = RDC_FI_PCIE_TX; + } + auto rx_metric 
= async_metrics_.find({gpu_index, RDC_FI_PCIE_RX}); + if (rx_metric == async_metrics_.end()) { + rx_metric = async_metrics_.insert( + {{gpu_index, RDC_FI_PCIE_RX}, value}).first; + rx_metric->second.value.field_id = RDC_FI_PCIE_RX; + } + + // Always update the status and last_time + tx_metric->second.last_time = curTime; + tx_metric->second.value.status = ret; + tx_metric->second.value.ts = curTime; + + rx_metric->second.last_time = curTime; + rx_metric->second.value.status = ret; + rx_metric->second.value.ts = curTime; + + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + RDC_LOG(RDC_ERROR, + "PCIe throughput not supported on GPU " << gpu_index); + return; + } + + if (ret == RSMI_STATUS_SUCCESS) { + rx_metric->second.value.value.l_int = received; + tx_metric->second.value.value.l_int = sent; + RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":" << + "RDC_FI_PCIE_RX and RDC_FI_PCIE_TX to cache."); + } + } while (0); +} + rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, uint32_t field_id, rdc_field_value* value) { if (!value) { return RDC_ST_BAD_PARAMETER; } uint64_t i64 = 0; + rsmi_temperature_type_t sensor_type; + rsmi_clk_type_t clk_type; if (!is_field_valid(field_id)) { RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id @@ -54,9 +228,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, return RDC_ST_NOT_SUPPORTED; } - struct timeval tv; - gettimeofday(&tv, NULL); - value->ts = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; + value->ts = now(); value->field_id = field_id; value->status = RSMI_STATUS_NOT_SUPPORTED; @@ -94,9 +266,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, } break; case RDC_FI_GPU_SM_CLOCK: + case RDC_FI_MEM_CLOCK: rsmi_frequencies_t f; + clk_type = RSMI_CLK_TYPE_SYS; + if (field_id == RDC_FI_MEM_CLOCK) { + clk_type = RSMI_CLK_TYPE_MEM; + } value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, - RSMI_CLK_TYPE_SYS, &f); + clk_type, &f); value->type = INTEGER; if 
(value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = f.frequency[f.current]; @@ -116,21 +293,33 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, value->type = STRING; break; case RDC_FI_GPU_TEMP: + case RDC_FI_MEMORY_TEMP: int64_t val_i64; + sensor_type = RSMI_TEMP_TYPE_EDGE; + if (field_id == RDC_FI_MEMORY_TEMP) { + sensor_type = RSMI_TEMP_TYPE_MEMORY; + } value->status = rsmi_dev_temp_metric_get(gpu_index, - 0, RSMI_TEMP_CURRENT, &val_i64); + sensor_type , RSMI_TEMP_CURRENT, &val_i64); + value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = val_i64; } break; + case RDC_FI_ECC_CORRECT_TOTAL: + case RDC_FI_ECC_UNCORRECT_TOTAL: + get_ecc_error(gpu_index, field_id, value); + break; + case RDC_FI_PCIE_TX: + case RDC_FI_PCIE_RX: + async_get_pcie_throughput(gpu_index, field_id, value); + break; default: break; } - gettimeofday(&tv, NULL); - int64_t latency = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000 - - value->ts; + int64_t latency = now()-value->ts; if (value->status != RSMI_STATUS_SUCCESS) { RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << field_id_string(field_id) << " with rsmi error code " diff --git a/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 7134a51992..7ed2663ae6 100644 --- a/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -41,7 +41,8 @@ RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, } rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64], uint64_t update_freq) { + char job_id[64], uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauges) { do { //< lock guard for thread safe std::lock_guard guard(watch_mutex_); if (job_watch_table_.find(job_id) != job_watch_table_.end()) { @@ -67,10 +68,6 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, job_watch_table_.insert({job_id, jentry}); } while (0); - 
result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); - if (result != RDC_ST_OK) { - return result; - } rdc_field_group_info_t finfo; rdc_group_info_t ginfo; @@ -85,12 +82,18 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, return result; } - result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo); + result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo, gpu_gauges); + if (result != RDC_ST_OK) { + return result; + } + // At last, when every thing sets up, starts to watch the fields. + result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); return result; } -rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) { +rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) { uint32_t job_group_id; do { //< lock guard for thread safe std::lock_guard guard(watch_mutex_); @@ -111,13 +114,14 @@ rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) { job_watch_table_.erase(job_id); } while (0); - result = cache_mgr_->rdc_job_stop_stats(job_id); + result = cache_mgr_->rdc_job_stop_stats(job_id, gpu_gauge); return result; } rdc_status_t RdcWatchTableImpl::rdc_job_remove(char job_id[64]) { - rdc_job_stop_stats(job_id); + rdc_gpu_gauges_t gpu_gauge; + rdc_job_stop_stats(job_id, gpu_gauge); return cache_mgr_->rdc_job_remove(job_id); } @@ -134,7 +138,8 @@ rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() { // Stop them for (auto job = v.begin(); job != v.end(); job++) { - rdc_job_stop_stats(const_cast(job->c_str())); + rdc_gpu_gauges_t gpu_gauge; + rdc_job_stop_stats(const_cast(job->c_str()), gpu_gauge); } return cache_mgr_->rdc_job_remove_all(); @@ -340,8 +345,9 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { auto fite = fields_to_watch_.begin(); for (; fite != fields_to_watch_.end(); fite++) { // Is this field need to be updated? 
+ uint64_t track_freq = fite->second.update_freq/1000; if (!fite->second.is_watching || - fite->second.last_update_time+fite->second.update_freq/1000 > now) { + fite->second.last_update_time+track_freq > now) { continue; } @@ -350,6 +356,10 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { result = metric_fetcher_->fetch_smi_field( fite->first.first, fite->first.second, &value); if (result != RDC_ST_OK) { + // To avoid retrying too frequently after an error, update the time + gettimeofday(&tv, NULL); + now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; + fite->second.last_update_time = now; continue; } diff --git a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index d6a845a708..12d63318ef 100644 --- a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -88,6 +88,8 @@ bool RdcStandaloneHandler::copy_gpu_usage_info( target->end_time = src.end_time(); target->energy_consumed = src.energy_consumed(); target->max_gpu_memory_used = src.max_gpu_memory_used(); + target->ecc_correct = src.ecc_correct(); + target->ecc_uncorrect = src.ecc_uncorrect(); const ::rdc::JobStatsSummary& pstats = src.power_usage(); target->power_usage.max_value = pstats.max_value(); @@ -109,6 +111,26 @@ bool RdcStandaloneHandler::copy_gpu_usage_info( target->memory_utilization.min_value = mstats.min_value(); target->memory_utilization.average = mstats.average(); + const ::rdc::JobStatsSummary& txstats = src.pcie_tx(); + target->pcie_tx.max_value = txstats.max_value(); + target->pcie_tx.min_value = txstats.min_value(); + target->pcie_tx.average = txstats.average(); + + const ::rdc::JobStatsSummary& rxstats = src.pcie_rx(); + target->pcie_rx.max_value = rxstats.max_value(); + target->pcie_rx.min_value = rxstats.min_value(); + target->pcie_rx.average = rxstats.average(); + + const ::rdc::JobStatsSummary& mcstats = src.memory_clock(); + target->memory_clock.max_value = mcstats.max_value(); + 
target->memory_clock.min_value = mcstats.min_value(); + target->memory_clock.average = mcstats.average(); + + const ::rdc::JobStatsSummary& gtstats = src.gpu_temperature(); + target->gpu_temperature.max_value = gtstats.max_value(); + target->gpu_temperature.min_value = gtstats.min_value(); + target->gpu_temperature.average = gtstats.average(); + return true; } rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64], diff --git a/rdci/src/RdciDmonSubSystem.cc b/rdci/src/RdciDmonSubSystem.cc index 0c477996f7..371eb23d0d 100644 --- a/rdci/src/RdciDmonSubSystem.cc +++ b/rdci/src/RdciDmonSubSystem.cc @@ -278,10 +278,19 @@ void RdciDmonSubSystem::create_temp_field_group() { void RdciDmonSubSystem::show_field_usage() const { std::cout << "Supported fields Ids:\n"; std::cout << "100 RDC_FI_GPU_SM_CLOCK: Current GPU clock frequencies.\n"; + std::cout << "101 RDC_FI_MEM_CLOCK: Current Memory clock frequencies.\n"; + std::cout << "140 RDC_FI_MEMORY_TEMP: Memory " + << "temperature in millidegrees Celsius.\n"; std::cout << "150 RDC_FI_GPU_TEMP: GPU " - << "temperature in millidegrees Celcius.\n"; + << "temperature in millidegrees Celsius.\n"; std::cout << "155 RDC_FI_POWER_USAGE: Power usage in microwatts.\n"; + std::cout << "200 RDC_FI_PCIE_TX: PCIe Tx utilization in bytes/second.\n"; + std::cout << "201 RDC_FI_PCIE_RX: PCIe Rx utilization in bytes/second.\n"; std::cout << "203 RDC_FI_GPU_UTIL: GPU busy percentage.\n"; + std::cout << "312 RDC_FI_ECC_CORRECT_TOTAL: Accumulated " + << "correctable ECC errors.\n"; + std::cout << "313 RDC_FI_ECC_UNCORRECT_TOTAL: Accumulated " + << "uncorrectable ECC errors.\n"; std::cout << "525 RDC_FI_GPU_MEMORY_USAGE: Memory usage of the GPU " << "instance in bytes.\n"; } @@ -361,7 +370,7 @@ void RdciDmonSubSystem::process() { group_info.entity_ids[gindex], field_info.field_ids[findex], &value); if (result != RDC_ST_OK) { - std::cout << std::left << std::setw(20) << "error"; + std::cout << std::left << std::setw(20) << "N/A"; } 
else { if (value.type == INTEGER) { std::cout << std::left << std::setw(20) diff --git a/rdci/src/RdciGroupSubSystem.cc b/rdci/src/RdciGroupSubSystem.cc index c105333217..2c7a159f6f 100644 --- a/rdci/src/RdciGroupSubSystem.cc +++ b/rdci/src/RdciGroupSubSystem.cc @@ -82,7 +82,10 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { group_name_ = optarg; break; case 'a': - group_ops_ = GROUP_ADD_GPUS; + // Create may add GPUs as well. + if (group_ops_ != GROUP_CREATE) { + group_ops_ = GROUP_ADD_GPUS; + } gpu_ids_ = optarg; break; case 'i': @@ -116,7 +119,8 @@ void RdciGroupSubSystem::show_help() const { std::cout << " group -- Used to create and maintain groups of GPUs.\n\n"; std::cout << "Usage\n"; std::cout << " rdci group [--host :port] [-u] -l\n"; - std::cout << " rdci group [--host :port] [-u] -c \n"; + std::cout << " rdci group [--host :port] [-u] -c " + << "[-a ]\n"; std::cout << " rdci group [--host :port] [-u] -g " << "[-a ]\n"; std::cout << " rdci group [--host :port] [-u] " @@ -157,6 +161,25 @@ void RdciGroupSubSystem::process() { rdc_gpu_group_t group_id; result = rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY, group_name_.c_str(), &group_id); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to create group " + + group_name_); + } + + gpu_ids = split_string(gpu_ids_, ','); + for (uint32_t i = 0; i < gpu_ids.size(); i++) { + if (!IsNumber(gpu_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The GPU Id "+gpu_ids[i]+" needs to be a number"); + } + result = rdc_group_gpu_add(rdc_handle_, + group_id, std::stoi(gpu_ids[i])); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to add GPU " + + gpu_ids[i] + " to the group"); + } + } + if (result == RDC_ST_OK) { std::cout << "Successfully created group with a group ID " << group_id << std::endl; @@ -214,7 +237,7 @@ void RdciGroupSubSystem::process() { for (uint32_t i = 0; i < gpu_ids.size(); i++) { if (!IsNumber(gpu_ids[i])) { throw 
RdcException(RDC_ST_BAD_PARAMETER, - "The GUP Id "+gpu_ids[i]+" needs to be a number"); + "The GPU Id "+gpu_ids[i]+" needs to be a number"); } result = rdc_group_gpu_add(rdc_handle_, group_id_, std::stoi(gpu_ids[i])); diff --git a/rdci/src/RdciStatsSubSystem.cc b/rdci/src/RdciStatsSubSystem.cc index 608237932e..d8f905c7e7 100644 --- a/rdci/src/RdciStatsSubSystem.cc +++ b/rdci/src/RdciStatsSubSystem.cc @@ -150,11 +150,11 @@ void RdciStatsSubSystem::show_job_stats( const rdc_gpu_usage_info_t& gpu_info) const { std::cout << "|------- Execution Stats ----------" << "+------------------------------------\n"; - std::cout << "| Start Time * | " + std::cout << "| Start Time | " << gpu_info.start_time << "\n"; - std::cout << "| End Time * | " + std::cout << "| End Time | " << gpu_info.end_time << "\n"; - std::cout << "| Total Execution Time (sec) * | " + std::cout << "| Total Execution Time (sec) | " << (gpu_info.end_time-gpu_info.start_time) << "\n"; std::cout << "+------- Performance Stats --------" << "+------------------------------------\n"; @@ -168,16 +168,36 @@ void RdciStatsSubSystem::show_job_stats( << gpu_info.gpu_clock.max_value << " Min: " << gpu_info.gpu_clock.min_value << " Avg: " << gpu_info.gpu_clock.average << "\n"; + std::cout << "| Memory Clock (MHz) | " << "Max: " + << gpu_info.memory_clock.max_value << " Min: " << + gpu_info.memory_clock.min_value << " Avg: " + << gpu_info.memory_clock.average << "\n"; std::cout << "| SM Utilization (%) | " << "Max: " << gpu_info.gpu_utilization.max_value <<" Min: " << gpu_info.gpu_utilization.min_value << " Avg: " << gpu_info.gpu_utilization.average << "\n"; - std::cout << "| Max GPU Memory Used (bytes) * | " << + std::cout << "| Max GPU Memory Used (bytes) | " << gpu_info.max_gpu_memory_used << "\n"; std::cout << "| Memory Utilization (%) | " << "Max: " << gpu_info.memory_utilization.max_value - <<" Min: "<< gpu_info.memory_utilization.min_value - << " Avg: " << gpu_info.memory_utilization.average << "\n"; + <<" Min: 
"<< gpu_info.memory_utilization.min_value + << " Avg: " << gpu_info.memory_utilization.average << "\n"; + std::cout << "| GPU Temperature (Celsius) | " + << "Max: " << gpu_info.gpu_temperature.max_value + <<" Min: "<< gpu_info.gpu_temperature.min_value + << " Avg: " << gpu_info.gpu_temperature.average << "\n"; + std::cout << "| PCIe Rx Bandwidth (megabytes) | " + << "Max: " << gpu_info.pcie_rx.max_value + <<" Min: "<< gpu_info.pcie_rx.min_value + << " Avg: " << gpu_info.pcie_rx.average << "\n"; + std::cout << "| PCIe Tx Bandwidth (megabytes) | " + << "Max: " << gpu_info.pcie_tx.max_value + <<" Min: "<< gpu_info.pcie_tx.min_value + << " Avg: " << gpu_info.pcie_tx.average << "\n"; + std::cout << "| Correctable ECC Errors | " + << gpu_info.ecc_correct << "\n"; + std::cout << "| Uncorrectable ECC Errors | " + << gpu_info.ecc_uncorrect << "\n"; std::cout << "+----------------------------------" << "+------------------------------------\n"; } diff --git a/server/src/rdc_api_service.cc b/server/src/rdc_api_service.cc index 8cd10dc046..e7e3119181 100755 --- a/server/src/rdc_api_service.cc +++ b/server/src/rdc_api_service.cc @@ -453,6 +453,11 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() { const_cast(request->job_id().c_str()), &job_info); + reply->set_status(result); + if (result != RDC_ST_OK) { + return ::grpc::Status::OK; + } + reply->set_num_gpus(job_info.num_gpus); ::rdc::GpuUsageInfo* sinfo = reply->mutable_summary(); copy_gpu_usage_info(job_info.summary, sinfo); @@ -462,8 +467,6 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() { copy_gpu_usage_info(job_info.gpus[i], ginfo); } - reply->set_status(result); - return ::grpc::Status::OK; } @@ -478,6 +481,8 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, target->set_end_time(src.end_time); target->set_energy_consumed(src.energy_consumed); target->set_max_gpu_memory_used(src.max_gpu_memory_used); + target->set_ecc_correct(src.ecc_correct); + target->set_ecc_uncorrect(src.ecc_uncorrect); 
::rdc::JobStatsSummary* stats = target->mutable_power_usage(); stats->set_max_value(src.power_usage.max_value); @@ -499,6 +504,25 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, stats->set_min_value(src.memory_utilization.min_value); stats->set_average(src.memory_utilization.average); + stats = target->mutable_pcie_tx(); + stats->set_max_value(src.pcie_tx.max_value); + stats->set_min_value(src.pcie_tx.min_value); + stats->set_average(src.pcie_tx.average); + + stats = target->mutable_pcie_rx(); + stats->set_max_value(src.pcie_rx.max_value); + stats->set_min_value(src.pcie_rx.min_value); + stats->set_average(src.pcie_rx.average); + + stats = target->mutable_memory_clock(); + stats->set_max_value(src.memory_clock.max_value); + stats->set_min_value(src.memory_clock.min_value); + stats->set_average(src.memory_clock.average); + + stats = target->mutable_gpu_temperature(); + stats->set_max_value(src.gpu_temperature.max_value); + stats->set_min_value(src.gpu_temperature.min_value); + stats->set_average(src.gpu_temperature.average); return true; }