diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index f429342727..c9de84715d 100755 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -144,16 +144,48 @@ typedef enum { */ #define RDC_FI_GPU_SM_CLOCK 100 +/** + * Clock for the memory + */ +#define RDC_FI_MEM_CLOCK 101 + +/** + * PCIe Tx utilization information + */ +#define RDC_FI_PCIE_TX 200 + +/** + * PCIe Rx utilization information + */ +#define RDC_FI_PCIE_RX 201 + + /** * GPU Utilization */ #define RDC_FI_GPU_UTIL 203 +/** + * Accumulated correctable ECC errors + */ +#define RDC_FI_ECC_CORRECT_TOTAL 312 + +/** + * Accumulated uncorrectable ECC errors + */ +#define RDC_FI_ECC_UNCORRECT_TOTAL 313 + +/** + * Memory temperature for the device + */ +#define RDC_FI_MEMORY_TEMP 140 + /** * Current temperature for the device */ #define RDC_FI_GPU_TEMP 150 + /** * GPU count in the system */ @@ -209,9 +241,15 @@ typedef struct { uint64_t end_time; //!< The time to stop the watching uint64_t energy_consumed; + uint64_t ecc_correct; + uint64_t ecc_uncorrect; + rdc_stats_summary_t pcie_tx; + rdc_stats_summary_t pcie_rx; rdc_stats_summary_t power_usage; rdc_stats_summary_t gpu_clock; + rdc_stats_summary_t memory_clock; rdc_stats_summary_t gpu_utilization; + rdc_stats_summary_t gpu_temperature; uint64_t max_gpu_memory_used; rdc_stats_summary_t memory_utilization; diff --git a/include/rdc_lib/RdcCacheManager.h b/include/rdc_lib/RdcCacheManager.h index 77eab78123..0883bf8f7f 100644 --- a/include/rdc_lib/RdcCacheManager.h +++ b/include/rdc_lib/RdcCacheManager.h @@ -32,7 +32,6 @@ THE SOFTWARE. 
namespace amd { namespace rdc { -typedef std::map rdc_gpu_total_memory_t; class RdcCacheManager { public: @@ -48,12 +47,14 @@ class RdcCacheManager { virtual std::string get_cache_stats() = 0; virtual rdc_status_t rdc_job_get_stats(char jobId[64], - const rdc_gpu_total_memory_t& total_memory, + const rdc_gpu_gauges_t& gpu_gauges, rdc_job_info_t* p_job_info) = 0; virtual rdc_status_t rdc_job_start_stats(char jobId[64], const rdc_group_info_t& group, - const rdc_field_group_info_t& finfo) = 0; - virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) = 0; virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id, const rdc_field_value& value) = 0; virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; @@ -64,8 +65,6 @@ class RdcCacheManager { typedef std::shared_ptr RdcCacheManagerPtr; -// -typedef std::pair RdcFieldKey; } // namespace rdc } // namespace amd diff --git a/include/rdc_lib/RdcWatchTable.h b/include/rdc_lib/RdcWatchTable.h index 34f01750b0..4adb2c39b7 100644 --- a/include/rdc_lib/RdcWatchTable.h +++ b/include/rdc_lib/RdcWatchTable.h @@ -36,8 +36,10 @@ class RdcWatchTable { virtual rdc_status_t rdc_field_update_all() = 0; virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64], uint64_t update_freq) = 0; - virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0; + char job_id[64], uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauge) = 0; + virtual rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) = 0; virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0; virtual rdc_status_t rdc_job_remove_all() = 0; diff --git a/include/rdc_lib/impl/RdcCacheManagerImpl.h b/include/rdc_lib/impl/RdcCacheManagerImpl.h index 952f376657..f8bf9d9ebd 100644 --- 
a/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -53,6 +53,8 @@ struct FieldSummaryStats { struct GpuSummaryStats { uint64_t energy_consumed; uint64_t energy_last_time; + uint64_t ecc_correct_init; // Init counter when job starts + uint64_t ecc_uncorrect_init; // Init counter when job starts std::map field_summaries; }; @@ -80,12 +82,14 @@ class RdcCacheManagerImpl: public RdcCacheManager { std::string get_cache_stats() override; rdc_status_t rdc_job_get_stats(char job_id[64], - const rdc_gpu_total_memory_t& total_memory, + const rdc_gpu_gauges_t& gpu_gauges, rdc_job_info_t* p_job_info) override; rdc_status_t rdc_job_start_stats(char job_id[64], const rdc_group_info_t& group, - const rdc_field_group_info_t& finfo) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) override; + rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) override; rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id, const rdc_field_value& value) override; diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index 57d4f73d96..cb440f1c16 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -92,6 +92,7 @@ class RdcEmbeddedHandler: public RdcHandler { ~RdcEmbeddedHandler(); private: + rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges); RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; RdcMetricFetcherPtr metric_fetcher_; diff --git a/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/include/rdc_lib/impl/RdcMetricFetcherImpl.h index cac7ffc0e4..886c58e84e 100644 --- a/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -22,16 +22,55 @@ THE SOFTWARE. 
#ifndef RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_ #define RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_ +#include // NOLINT(build/c++11) +#include // NOLINT(build/c++11) +#include // NOLINT(build/c++11) +#include +#include #include "rdc_lib/RdcMetricFetcher.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { +//!< Some metrics, like PCIe throughput, may take a second to retrieve. The +//!< MetricValue will cache those metrics for async retrieval. +struct MetricValue { + uint64_t cache_ttl; + uint64_t last_time; + rdc_field_value value; +}; + + +//!< The data structure to store the async fetch task +class RdcMetricFetcherImpl; +struct MetricTask { + RdcFieldKey field; + std::function task; +}; + class RdcMetricFetcherImpl: public RdcMetricFetcher { public: rdc_status_t fetch_smi_field(uint32_t gpu_index, uint32_t field_id, rdc_field_value* value) override; bool is_field_valid(uint32_t field_id) const override; + RdcMetricFetcherImpl(); + ~RdcMetricFetcherImpl(); + private: + uint64_t now(); + void get_ecc_error(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value); + void async_get_pcie_throughput(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value); + void get_pcie_throughput(const RdcFieldKey& key); + + //!< Async metric retrieval + std::map async_metrics_; + std::queue updated_tasks_; + std::mutex task_mutex_; + std::future updater_; // keep the future of updater + std::condition_variable cv_; + std::atomic task_started_; }; } // namespace rdc diff --git a/include/rdc_lib/impl/RdcWatchTableImpl.h b/include/rdc_lib/impl/RdcWatchTableImpl.h index 266fd91911..15977f2a59 100644 --- a/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -54,8 +54,10 @@ struct JobWatchTableEntry { class RdcWatchTableImpl : public RdcWatchTable { public: rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64], uint64_t update_freq) override; - rdc_status_t rdc_job_stop_stats(char job_id[64]) override; + 
char job_id[64], uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauge) override; + rdc_status_t rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) override; rdc_status_t rdc_job_remove(char job_id[64]) override; rdc_status_t rdc_job_remove_all() override; diff --git a/include/rdc_lib/rdc_common.h b/include/rdc_lib/rdc_common.h index d08ddae6fa..46a1a01e1c 100644 --- a/include/rdc_lib/rdc_common.h +++ b/include/rdc_lib/rdc_common.h @@ -23,6 +23,8 @@ THE SOFTWARE. #ifndef RDC_LIB_RDC_COMMON_H_ #define RDC_LIB_RDC_COMMON_H_ #include +#include +#include #define RDC_ERROR 0 #define RDC_INFO 1 @@ -37,6 +39,12 @@ THE SOFTWARE. } \ } while (0) +// +typedef std::pair RdcFieldKey; + +//!< The gauge metrics do not require aggregations +typedef std::map rdc_gpu_gauges_t; + /** * @brief The strncpy but with null terminated * diff --git a/protos/rdc.proto b/protos/rdc.proto index 00ab6f4cc2..6e9a1b9627 100755 --- a/protos/rdc.proto +++ b/protos/rdc.proto @@ -427,6 +427,12 @@ message GpuUsageInfo { JobStatsSummary gpu_utilization = 7; uint64 max_gpu_memory_used = 8; JobStatsSummary memory_utilization = 9; + uint64 ecc_correct = 10; + uint64 ecc_uncorrect = 11; + JobStatsSummary pcie_tx = 12; + JobStatsSummary pcie_rx = 13; + JobStatsSummary memory_clock = 14; + JobStatsSummary gpu_temperature = 15; } message GetJobStatsResponse { uint32 status = 1; diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index 0326bd0f26..392b3782de 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -358,6 +358,12 @@ const char* field_id_string(uint32_t field_id) { {RDC_FI_GPU_UTIL, "GPU_UTIL"}, {RDC_FI_GPU_TEMP, "GPU_TEMP"}, {RDC_FI_GPU_COUNT, "GPU_COUNT"}, + {RDC_FI_MEM_CLOCK, "MEM_CLOCK"}, + {RDC_FI_PCIE_TX, "PCIE_TX"}, + {RDC_FI_PCIE_RX, "PCIE_RX"}, + {RDC_FI_ECC_CORRECT_TOTAL, "ECC_CORRECT"}, + {RDC_FI_ECC_UNCORRECT_TOTAL, "ECC_UNCORRECT"}, + {RDC_FI_MEMORY_TEMP, "MEMORY_TEMP"}, 
{RDC_FI_DEV_NAME, "DEV_NAME"} }; diff --git a/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index b8d503c9c1..e86e0f435d 100644 --- a/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -235,7 +235,11 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index, void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats, rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary, unsigned int adjuster) { - if (stats.count == 0) return; + if (stats.count == 0) { + gpu.min_value = std::numeric_limits::max(); + gpu.max_value = gpu.average = 0; + return; + } gpu.max_value = stats.max_value / adjuster; gpu.min_value = stats.min_value / adjuster; @@ -247,7 +251,7 @@ void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats, } rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], - const rdc_gpu_total_memory_t& total_memory, + const rdc_gpu_gauges_t& gpu_gauges, rdc_job_info_t* p_job_info) { std::lock_guard guard(cache_mutex_); auto job_stats = cache_jobs_.find(jobId); @@ -257,6 +261,7 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], } //< Init the summary info + bool is_job_stopped = (job_stats->second.end_time != 0); RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " << jobId); auto& summary_info = p_job_info->summary; summary_info.start_time = job_stats->second.start_time; @@ -267,7 +272,13 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], } summary_info.energy_consumed = 0; summary_info.max_gpu_memory_used = 0; + summary_info.ecc_correct = 0; + summary_info.ecc_uncorrect = 0; summary_info.power_usage = {0, std::numeric_limits::max(), 0}; + summary_info.pcie_tx = {0, std::numeric_limits::max(), 0}; + summary_info.pcie_rx = {0, std::numeric_limits::max(), 0}; + summary_info.gpu_temperature = {0, std::numeric_limits::max(), 0}; + summary_info.memory_clock = {0, std::numeric_limits::max(), 0}; 
summary_info.gpu_clock = {0, std::numeric_limits::max(), 0}; summary_info.gpu_utilization = {0, std::numeric_limits::max(), 0}; summary_info.memory_utilization = {0, @@ -285,13 +296,46 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], gpu_info.energy_consumed = gpus->second.energy_consumed; summary_info.energy_consumed += gpu_info.energy_consumed; + if (is_job_stopped) { + gpu_info.ecc_correct = gpus->second.ecc_correct_init; + summary_info.ecc_correct += gpu_info.ecc_correct; + } else if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { + gpu_info.ecc_correct = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - + gpus->second.ecc_correct_init; + summary_info.ecc_correct += gpu_info.ecc_correct; + } else { + gpu_info.ecc_correct = 0; + } + + if (is_job_stopped) { + gpu_info.ecc_uncorrect = gpus->second.ecc_uncorrect_init; + summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; + } else if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { + gpu_info.ecc_uncorrect = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - + gpus->second.ecc_uncorrect_init; + summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect; + } else { + gpu_info.ecc_uncorrect = 0; + } + + if (gpu_gauges.find({gpus->first, + RDC_FI_GPU_MEMORY_TOTAL}) == gpu_gauges.end()) { + RDC_LOG(RDC_ERROR, "Cannot find the total memory"); + return RDC_ST_BAD_PARAMETER; + } + uint64_t tmemory = gpu_gauges.at({gpus->first, + RDC_FI_GPU_MEMORY_TOTAL}); + auto ite = gpus->second.field_summaries.begin(); for (; ite != gpus->second.field_summaries.end(); ite++) { if (ite->first == RDC_FI_POWER_USAGE) { set_summary(ite->second, gpu_info.power_usage, summary_info.power_usage, 1000000); } else if (ite->first == RDC_FI_GPU_MEMORY_USAGE) { - auto tmemory = total_memory.at(gpus->first); set_summary(ite->second, gpu_info.memory_utilization, summary_info.memory_utilization, tmemory/100); gpu_info.max_gpu_memory_used = 
ite->second.max_value; @@ -304,6 +348,18 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], } else if (ite->first == RDC_FI_GPU_UTIL) { set_summary(ite->second, gpu_info.gpu_utilization, summary_info.gpu_utilization, 1); + } else if (ite->first == RDC_FI_GPU_TEMP) { + set_summary(ite->second, + gpu_info.gpu_temperature, summary_info.gpu_temperature, 1000); + } else if (ite->first == RDC_FI_MEM_CLOCK) { + set_summary(ite->second, + gpu_info.memory_clock, summary_info.memory_clock, 1000000); + } else if (ite->first == RDC_FI_PCIE_TX) { + set_summary(ite->second, + gpu_info.pcie_tx, summary_info.pcie_tx, 1024*1024); + } else if (ite->first == RDC_FI_PCIE_RX) { + set_summary(ite->second, + gpu_info.pcie_rx, summary_info.pcie_rx, 1024*1024); } } } @@ -316,12 +372,21 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64], p_job_info->num_gpus; summary_info.memory_utilization.average = summary_info.memory_utilization.average/p_job_info->num_gpus; + summary_info.pcie_tx.average = summary_info.pcie_tx.average/ + p_job_info->num_gpus; + summary_info.pcie_rx.average = summary_info.pcie_rx.average/ + p_job_info->num_gpus; + summary_info.gpu_temperature.average = summary_info.gpu_temperature.average/ + p_job_info->num_gpus; + summary_info.memory_clock.average = summary_info.memory_clock.average/ + p_job_info->num_gpus; return RDC_ST_OK; } rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], - const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo) { + const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo, + const rdc_gpu_gauges_t& gpu_gauges) { RdcJobStatsCacheEntry cacheEntry; cacheEntry.start_time = std::time(nullptr); cacheEntry.end_time = 0; @@ -336,6 +401,20 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], gstats.field_summaries.insert({finfo.field_ids[j], s}); } + gstats.ecc_correct_init = 0; + if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}) != + 
gpu_gauges.end()) { + gstats.ecc_correct_init = gpu_gauges.at( + {ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}); + } + + gstats.ecc_uncorrect_init = 0; + if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}) != + gpu_gauges.end()) { + gstats.ecc_uncorrect_init = gpu_gauges.at( + {ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}); + } + cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats}); } @@ -347,7 +426,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64], } -rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) { +rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauges) { std::lock_guard guard(cache_mutex_); auto job_stats = cache_jobs_.find(job_id); @@ -357,6 +437,24 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) { job_stats->second.end_time = std::time(nullptr); + // update the ecc errors + auto gpus = job_stats->second.gpu_stats.begin(); + for (; gpus != job_stats->second.gpu_stats.end(); gpus++) { + if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) { + gpus->second.ecc_correct_init = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) - + gpus->second.ecc_correct_init; + } + + if (gpu_gauges.find({gpus->first, + RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) { + gpus->second.ecc_uncorrect_init = gpu_gauges.at({ + gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) - + gpus->second.ecc_uncorrect_init; + } + } + return RDC_ST_OK; } diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index fa5e3d3dc1..bd02f1b7d6 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -86,38 +86,75 @@ RdcEmbeddedHandler::~RdcEmbeddedHandler() { // JOB API rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, char job_id[64], uint64_t update_freq) { - return watch_table_->rdc_job_start_stats(groupId, 
job_id, update_freq); + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; + + return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq, + gpu_gauges); } -rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], - rdc_job_info_t* p_job_info) { +rdc_status_t RdcEmbeddedHandler::get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges) { uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; uint32_t count = 0; + + if (gpu_gauges == nullptr) { + return RDC_ST_BAD_PARAMETER; + } rdc_status_t status = rdc_device_get_all( gpu_index_list, &count); if (status != RDC_ST_OK) { return status; } - rdc_gpu_total_memory_t all_total_memory; - + // Fetch total memory and current ecc errors for (uint32_t i = 0; i < count ; i++) { - rdc_field_value total_memory; + rdc_field_value value; status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], - RDC_FI_GPU_MEMORY_TOTAL, &total_memory); + RDC_FI_GPU_MEMORY_TOTAL, &value); if (status != RDC_ST_OK) { RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU " << gpu_index_list[i]); return status; } - all_total_memory.insert({gpu_index_list[i], total_memory.value.l_int}); + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_GPU_MEMORY_TOTAL}, + value.value.l_int}); + + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], + RDC_FI_ECC_CORRECT_TOTAL, &value); + if (status == RDC_ST_OK) { + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_CORRECT_TOTAL}, + value.value.l_int}); + } + status = metric_fetcher_->fetch_smi_field(gpu_index_list[i], + RDC_FI_ECC_UNCORRECT_TOTAL, &value); + if (status == RDC_ST_OK) { + gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_UNCORRECT_TOTAL}, + value.value.l_int}); + } + } + return RDC_ST_OK; +} + +rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64], + rdc_job_info_t* p_job_info) { + if (p_job_info == nullptr) { + return RDC_ST_BAD_PARAMETER; } - return cache_mgr_->rdc_job_get_stats(job_id, all_total_memory, 
p_job_info); + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; + + return cache_mgr_->rdc_job_get_stats(job_id, gpu_gauges, p_job_info); } rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64]) { - return watch_table_->rdc_job_stop_stats(job_id); + rdc_gpu_gauges_t gpu_gauges; + rdc_status_t status = get_gpu_gauges(&gpu_gauges); + if (status != RDC_ST_OK) return status; + + return watch_table_->rdc_job_stop_stats(job_id, gpu_gauges); } rdc_status_t RdcEmbeddedHandler::rdc_job_remove(char job_id[64]) { diff --git a/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc index eb89ddb505..9ba40b7130 100644 --- a/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc +++ b/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -30,7 +30,9 @@ namespace rdc { RdcGroupSettingsImpl::RdcGroupSettingsImpl() { // Add the default job stats fields uint32_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, - RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL}; + RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, + RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, RDC_FI_MEM_CLOCK, + RDC_FI_GPU_TEMP}; char job_field_group[] = "JobStatsFields"; rdc_field_grp_t fgid = JOB_FIELD_ID; diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 114510b8e5..34200ced8a 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -35,18 +35,192 @@ namespace rdc { bool RdcMetricFetcherImpl::is_field_valid(uint32_t field_id) const { const std::vector all_fields = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_GPU_MEMORY_TOTAL, RDC_FI_GPU_COUNT, RDC_FI_POWER_USAGE, - RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME, RDC_FI_GPU_TEMP}; + RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME, RDC_FI_GPU_TEMP, + RDC_FI_MEM_CLOCK, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, + RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, RDC_FI_MEMORY_TEMP}; 
return std::find(all_fields.begin(), all_fields.end(), field_id) != all_fields.end(); } +RdcMetricFetcherImpl::RdcMetricFetcherImpl() { + task_started_ = true; + + // kick off another thread for async fetch + updater_ = std::async(std::launch::async, [this]() { + while (task_started_) { + std::unique_lock lk(task_mutex_); + // Wait for tasks or stop signal + cv_.wait(lk, [this]{ + return !updated_tasks_.empty() || !task_started_; + }); + if (updated_tasks_.empty()) continue; + + // Get the tasks + auto item = updated_tasks_.front(); + updated_tasks_.pop(); + // The task may take long time, release lock + lk.unlock(); + + // run task + item.task(*this, item.field); + } // end while (task_started_) + }); +} + +RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { + // Notify the async task to stop + task_started_ = false; + cv_.notify_all(); +} + +uint64_t RdcMetricFetcherImpl::now() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; +} + +void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value) { + rsmi_status_t err = RSMI_STATUS_SUCCESS; + uint64_t correctable_err = 0; + uint64_t uncorrectable_err = 0; + rsmi_ras_err_state_t err_state; + + if (!value) { + return; + } + for (uint32_t b = RSMI_GPU_BLOCK_FIRST; + b <= RSMI_GPU_BLOCK_LAST; b = b*2) { + err = rsmi_dev_ecc_status_get(gpu_index, static_cast(b), + &err_state); + if (err != RSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "Get the ecc Status error " << b + << ":" << err); + continue; + } + + rsmi_error_count_t ec; + err = rsmi_dev_ecc_count_get(gpu_index, + static_cast(b), &ec); + + if (err == RSMI_STATUS_SUCCESS) { + correctable_err += ec.correctable_err; + uncorrectable_err += ec.uncorrectable_err; + } + } + + value->status = RSMI_STATUS_SUCCESS; + value->type = INTEGER; + if (field_id == RDC_FI_ECC_CORRECT_TOTAL) { + value->value.l_int = correctable_err; + } + if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) { + 
value->value.l_int = uncorrectable_err; + } +} + +void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, + uint32_t field_id, rdc_field_value* value) { + if (!value) { + return; + } + + do { + std::lock_guard guard(task_mutex_); + auto metric = async_metrics_.find({gpu_index, field_id}); + if ( metric != async_metrics_.end() ) { + if (now() < metric->second.last_time + metric->second.cache_ttl) { + RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << + field_id_string(field_id) << " from cache"); + value->status = metric->second.value.status; + value->type = metric->second.value.type; + value->value = metric->second.value.value; + return; + } + } + + // add to the async task queue + MetricTask t; + t.field = {gpu_index, field_id}; + t.task = &RdcMetricFetcherImpl::get_pcie_throughput; + updated_tasks_.push(t); + + RDC_LOG(RDC_DEBUG, "Start async fetch " << gpu_index << ":" << + field_id_string(field_id) << " to cache."); + } while (0); + cv_.notify_all(); +} + +void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { + uint32_t gpu_index = key.first; + uint64_t sent, received, max_pkt_sz; + rsmi_status_t ret; + + // Return if the cache has not expired yet + do { + std::lock_guard guard(task_mutex_); + auto metric = async_metrics_.find(key); + if (metric != async_metrics_.end() && + now() < metric->second.last_time + metric->second.cache_ttl) { + return; + } + } while (0); + + ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz); + + uint64_t curTime = now(); + MetricValue value; + value.cache_ttl = 30*1000; // cache 30 seconds + value.value.type = INTEGER; + do { + std::lock_guard guard(task_mutex_); + // Create a new cache entry if it does not exist + auto tx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_TX}); + if (tx_metric == async_metrics_.end()) { + tx_metric = async_metrics_.insert( + {{gpu_index, RDC_FI_PCIE_TX}, value}).first; + tx_metric->second.value.field_id = RDC_FI_PCIE_TX; + } + auto rx_metric 
= async_metrics_.find({gpu_index, RDC_FI_PCIE_RX}); + if (rx_metric == async_metrics_.end()) { + rx_metric = async_metrics_.insert( + {{gpu_index, RDC_FI_PCIE_RX}, value}).first; + rx_metric->second.value.field_id = RDC_FI_PCIE_RX; + } + + // Always update the status and last_time + tx_metric->second.last_time = curTime; + tx_metric->second.value.status = ret; + tx_metric->second.value.ts = curTime; + + rx_metric->second.last_time = curTime; + rx_metric->second.value.status = ret; + rx_metric->second.value.ts = curTime; + + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + RDC_LOG(RDC_ERROR, + "PCIe throughput not supported on GPU " << gpu_index); + return; + } + + if (ret == RSMI_STATUS_SUCCESS) { + rx_metric->second.value.value.l_int = received; + tx_metric->second.value.value.l_int = sent; + RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":" << + "RDC_FI_PCIE_RX and RDC_FI_PCIE_TX to cache."); + } + } while (0); +} + rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, uint32_t field_id, rdc_field_value* value) { if (!value) { return RDC_ST_BAD_PARAMETER; } uint64_t i64 = 0; + rsmi_temperature_type_t sensor_type; + rsmi_clk_type_t clk_type; if (!is_field_valid(field_id)) { RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id @@ -54,9 +228,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, return RDC_ST_NOT_SUPPORTED; } - struct timeval tv; - gettimeofday(&tv, NULL); - value->ts = static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; + value->ts = now(); value->field_id = field_id; value->status = RSMI_STATUS_NOT_SUPPORTED; @@ -94,9 +266,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, } break; case RDC_FI_GPU_SM_CLOCK: + case RDC_FI_MEM_CLOCK: rsmi_frequencies_t f; + clk_type = RSMI_CLK_TYPE_SYS; + if (field_id == RDC_FI_MEM_CLOCK) { + clk_type = RSMI_CLK_TYPE_MEM; + } value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, - RSMI_CLK_TYPE_SYS, &f); + clk_type, &f); value->type = INTEGER; if 
(value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = f.frequency[f.current]; @@ -116,21 +293,33 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, value->type = STRING; break; case RDC_FI_GPU_TEMP: + case RDC_FI_MEMORY_TEMP: int64_t val_i64; + sensor_type = RSMI_TEMP_TYPE_EDGE; + if (field_id == RDC_FI_MEMORY_TEMP) { + sensor_type = RSMI_TEMP_TYPE_MEMORY; + } value->status = rsmi_dev_temp_metric_get(gpu_index, - 0, RSMI_TEMP_CURRENT, &val_i64); + sensor_type , RSMI_TEMP_CURRENT, &val_i64); + value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = val_i64; } break; + case RDC_FI_ECC_CORRECT_TOTAL: + case RDC_FI_ECC_UNCORRECT_TOTAL: + get_ecc_error(gpu_index, field_id, value); + break; + case RDC_FI_PCIE_TX: + case RDC_FI_PCIE_RX: + async_get_pcie_throughput(gpu_index, field_id, value); + break; default: break; } - gettimeofday(&tv, NULL); - int64_t latency = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000 - - value->ts; + int64_t latency = now()-value->ts; if (value->status != RSMI_STATUS_SUCCESS) { RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << field_id_string(field_id) << " with rsmi error code " diff --git a/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 7134a51992..7ed2663ae6 100644 --- a/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -41,7 +41,8 @@ RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, } rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, - char job_id[64], uint64_t update_freq) { + char job_id[64], uint64_t update_freq, + const rdc_gpu_gauges_t& gpu_gauges) { do { //< lock guard for thread safe std::lock_guard guard(watch_mutex_); if (job_watch_table_.find(job_id) != job_watch_table_.end()) { @@ -67,10 +68,6 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, job_watch_table_.insert({job_id, jentry}); } while (0); - 
result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); - if (result != RDC_ST_OK) { - return result; - } rdc_field_group_info_t finfo; rdc_group_info_t ginfo; @@ -85,12 +82,18 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, return result; } - result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo); + result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo, gpu_gauges); + if (result != RDC_ST_OK) { + return result; + } + // Finally, once everything is set up, start watching the fields. + result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0); return result; } -rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) { +rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64], + const rdc_gpu_gauges_t& gpu_gauge) { uint32_t job_group_id; do { //< lock guard for thread safe std::lock_guard guard(watch_mutex_); @@ -111,13 +114,14 @@ rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) { job_watch_table_.erase(job_id); } while (0); - result = cache_mgr_->rdc_job_stop_stats(job_id); + result = cache_mgr_->rdc_job_stop_stats(job_id, gpu_gauge); return result; } rdc_status_t RdcWatchTableImpl::rdc_job_remove(char job_id[64]) { - rdc_job_stop_stats(job_id); + rdc_gpu_gauges_t gpu_gauge; + rdc_job_stop_stats(job_id, gpu_gauge); return cache_mgr_->rdc_job_remove(job_id); } @@ -134,7 +138,8 @@ rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() { // Stop them for (auto job = v.begin(); job != v.end(); job++) { - rdc_job_stop_stats(const_cast(job->c_str())); + rdc_gpu_gauges_t gpu_gauge; + rdc_job_stop_stats(const_cast(job->c_str()), gpu_gauge); } return cache_mgr_->rdc_job_remove_all(); @@ -340,8 +345,9 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { auto fite = fields_to_watch_.begin(); for (; fite != fields_to_watch_.end(); fite++) { // Is this field need to be updated? 
+ uint64_t track_freq = fite->second.update_freq/1000; if (!fite->second.is_watching || - fite->second.last_update_time+fite->second.update_freq/1000 > now) { + fite->second.last_update_time+track_freq > now) { continue; } @@ -350,6 +356,10 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { result = metric_fetcher_->fetch_smi_field( fite->first.first, fite->first.second, &value); if (result != RDC_ST_OK) { + // To avoid retrying too frequently after an error, update the time + gettimeofday(&tv, NULL); + now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; + fite->second.last_update_time = now; continue; } diff --git a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index d6a845a708..12d63318ef 100644 --- a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -88,6 +88,8 @@ bool RdcStandaloneHandler::copy_gpu_usage_info( target->end_time = src.end_time(); target->energy_consumed = src.energy_consumed(); target->max_gpu_memory_used = src.max_gpu_memory_used(); + target->ecc_correct = src.ecc_correct(); + target->ecc_uncorrect = src.ecc_uncorrect(); const ::rdc::JobStatsSummary& pstats = src.power_usage(); target->power_usage.max_value = pstats.max_value(); @@ -109,6 +111,26 @@ bool RdcStandaloneHandler::copy_gpu_usage_info( target->memory_utilization.min_value = mstats.min_value(); target->memory_utilization.average = mstats.average(); + const ::rdc::JobStatsSummary& txstats = src.pcie_tx(); + target->pcie_tx.max_value = txstats.max_value(); + target->pcie_tx.min_value = txstats.min_value(); + target->pcie_tx.average = txstats.average(); + + const ::rdc::JobStatsSummary& rxstats = src.pcie_rx(); + target->pcie_rx.max_value = rxstats.max_value(); + target->pcie_rx.min_value = rxstats.min_value(); + target->pcie_rx.average = rxstats.average(); + + const ::rdc::JobStatsSummary& mcstats = src.memory_clock(); + target->memory_clock.max_value = mcstats.max_value(); + 
target->memory_clock.min_value = mcstats.min_value(); + target->memory_clock.average = mcstats.average(); + + const ::rdc::JobStatsSummary& gtstats = src.gpu_temperature(); + target->gpu_temperature.max_value = gtstats.max_value(); + target->gpu_temperature.min_value = gtstats.min_value(); + target->gpu_temperature.average = gtstats.average(); + return true; } rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64], diff --git a/rdci/src/RdciDmonSubSystem.cc b/rdci/src/RdciDmonSubSystem.cc index 0c477996f7..371eb23d0d 100644 --- a/rdci/src/RdciDmonSubSystem.cc +++ b/rdci/src/RdciDmonSubSystem.cc @@ -278,10 +278,19 @@ void RdciDmonSubSystem::create_temp_field_group() { void RdciDmonSubSystem::show_field_usage() const { std::cout << "Supported fields Ids:\n"; std::cout << "100 RDC_FI_GPU_SM_CLOCK: Current GPU clock frequencies.\n"; + std::cout << "101 RDC_FI_MEM_CLOCK: Current Memory clock frequencies.\n"; + std::cout << "140 RDC_FI_MEMORY_TEMP: Memory " + << "temperature in millidegrees Celsius.\n"; std::cout << "150 RDC_FI_GPU_TEMP: GPU " - << "temperature in millidegrees Celcius.\n"; + << "temperature in millidegrees Celsius.\n"; std::cout << "155 RDC_FI_POWER_USAGE: Power usage in microwatts.\n"; + std::cout << "200 RDC_FI_PCIE_TX: PCIe Tx utilization in bytes/second.\n"; + std::cout << "201 RDC_FI_PCIE_RX: PCIe Rx utilization in bytes/second.\n"; std::cout << "203 RDC_FI_GPU_UTIL: GPU busy percentage.\n"; + std::cout << "312 RDC_FI_ECC_CORRECT_TOTAL: Accumulated " + << "correctable ECC errors.\n"; + std::cout << "313 RDC_FI_ECC_UNCORRECT_TOTAL: Accumulated " + << "uncorrectable ECC errors.\n"; std::cout << "525 RDC_FI_GPU_MEMORY_USAGE: Memory usage of the GPU " << "instance in bytes.\n"; } @@ -361,7 +370,7 @@ void RdciDmonSubSystem::process() { group_info.entity_ids[gindex], field_info.field_ids[findex], &value); if (result != RDC_ST_OK) { - std::cout << std::left << std::setw(20) << "error"; + std::cout << std::left << std::setw(20) << "N/A"; } 
else { if (value.type == INTEGER) { std::cout << std::left << std::setw(20) diff --git a/rdci/src/RdciGroupSubSystem.cc b/rdci/src/RdciGroupSubSystem.cc index c105333217..2c7a159f6f 100644 --- a/rdci/src/RdciGroupSubSystem.cc +++ b/rdci/src/RdciGroupSubSystem.cc @@ -82,7 +82,10 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) { group_name_ = optarg; break; case 'a': - group_ops_ = GROUP_ADD_GPUS; + // Create may add GPUs as well. + if (group_ops_ != GROUP_CREATE) { + group_ops_ = GROUP_ADD_GPUS; + } gpu_ids_ = optarg; break; case 'i': @@ -116,7 +119,8 @@ void RdciGroupSubSystem::show_help() const { std::cout << " group -- Used to create and maintain groups of GPUs.\n\n"; std::cout << "Usage\n"; std::cout << " rdci group [--host :port] [-u] -l\n"; - std::cout << " rdci group [--host :port] [-u] -c \n"; + std::cout << " rdci group [--host :port] [-u] -c " + << "[-a ]\n"; std::cout << " rdci group [--host :port] [-u] -g " << "[-a ]\n"; std::cout << " rdci group [--host :port] [-u] " @@ -157,6 +161,25 @@ void RdciGroupSubSystem::process() { rdc_gpu_group_t group_id; result = rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY, group_name_.c_str(), &group_id); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to create group " + + group_name_); + } + + gpu_ids = split_string(gpu_ids_, ','); + for (uint32_t i = 0; i < gpu_ids.size(); i++) { + if (!IsNumber(gpu_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The GPU Id "+gpu_ids[i]+" needs to be a number"); + } + result = rdc_group_gpu_add(rdc_handle_, + group_id, std::stoi(gpu_ids[i])); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to add GPU " + + gpu_ids[i] + " to the group"); + } + } + if (result == RDC_ST_OK) { std::cout << "Successfully created group with a group ID " << group_id << std::endl; @@ -214,7 +237,7 @@ void RdciGroupSubSystem::process() { for (uint32_t i = 0; i < gpu_ids.size(); i++) { if (!IsNumber(gpu_ids[i])) { throw 
RdcException(RDC_ST_BAD_PARAMETER, - "The GUP Id "+gpu_ids[i]+" needs to be a number"); + "The GPU Id "+gpu_ids[i]+" needs to be a number"); } result = rdc_group_gpu_add(rdc_handle_, group_id_, std::stoi(gpu_ids[i])); diff --git a/rdci/src/RdciStatsSubSystem.cc b/rdci/src/RdciStatsSubSystem.cc index 608237932e..d8f905c7e7 100644 --- a/rdci/src/RdciStatsSubSystem.cc +++ b/rdci/src/RdciStatsSubSystem.cc @@ -150,11 +150,11 @@ void RdciStatsSubSystem::show_job_stats( const rdc_gpu_usage_info_t& gpu_info) const { std::cout << "|------- Execution Stats ----------" << "+------------------------------------\n"; - std::cout << "| Start Time * | " + std::cout << "| Start Time | " << gpu_info.start_time << "\n"; - std::cout << "| End Time * | " + std::cout << "| End Time | " << gpu_info.end_time << "\n"; - std::cout << "| Total Execution Time (sec) * | " + std::cout << "| Total Execution Time (sec) | " << (gpu_info.end_time-gpu_info.start_time) << "\n"; std::cout << "+------- Performance Stats --------" << "+------------------------------------\n"; @@ -168,16 +168,36 @@ void RdciStatsSubSystem::show_job_stats( << gpu_info.gpu_clock.max_value << " Min: " << gpu_info.gpu_clock.min_value << " Avg: " << gpu_info.gpu_clock.average << "\n"; + std::cout << "| Memory Clock (MHz) | " << "Max: " + << gpu_info.memory_clock.max_value << " Min: " << + gpu_info.memory_clock.min_value << " Avg: " + << gpu_info.memory_clock.average << "\n"; std::cout << "| SM Utilization (%) | " << "Max: " << gpu_info.gpu_utilization.max_value <<" Min: " << gpu_info.gpu_utilization.min_value << " Avg: " << gpu_info.gpu_utilization.average << "\n"; - std::cout << "| Max GPU Memory Used (bytes) * | " << + std::cout << "| Max GPU Memory Used (bytes) | " << gpu_info.max_gpu_memory_used << "\n"; std::cout << "| Memory Utilization (%) | " << "Max: " << gpu_info.memory_utilization.max_value - <<" Min: "<< gpu_info.memory_utilization.min_value - << " Avg: " << gpu_info.memory_utilization.average << "\n"; + <<" Min: 
"<< gpu_info.memory_utilization.min_value + << " Avg: " << gpu_info.memory_utilization.average << "\n"; + std::cout << "| GPU Temperature (Celsius) | " + << "Max: " << gpu_info.gpu_temperature.max_value + <<" Min: "<< gpu_info.gpu_temperature.min_value + << " Avg: " << gpu_info.gpu_temperature.average << "\n"; + std::cout << "| PCIe Rx Bandwidth (megabytes) | " + << "Max: " << gpu_info.pcie_rx.max_value + <<" Min: "<< gpu_info.pcie_rx.min_value + << " Avg: " << gpu_info.pcie_rx.average << "\n"; + std::cout << "| PCIe Tx Bandwidth (megabytes) | " + << "Max: " << gpu_info.pcie_tx.max_value + <<" Min: "<< gpu_info.pcie_tx.min_value + << " Avg: " << gpu_info.pcie_tx.average << "\n"; + std::cout << "| Correctable ECC Errors | " + << gpu_info.ecc_correct << "\n"; + std::cout << "| Uncorrectable ECC Errors | " + << gpu_info.ecc_uncorrect << "\n"; std::cout << "+----------------------------------" << "+------------------------------------\n"; } diff --git a/server/src/rdc_api_service.cc b/server/src/rdc_api_service.cc index 8cd10dc046..e7e3119181 100755 --- a/server/src/rdc_api_service.cc +++ b/server/src/rdc_api_service.cc @@ -453,6 +453,11 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() { const_cast(request->job_id().c_str()), &job_info); + reply->set_status(result); + if (result != RDC_ST_OK) { + return ::grpc::Status::OK; + } + reply->set_num_gpus(job_info.num_gpus); ::rdc::GpuUsageInfo* sinfo = reply->mutable_summary(); copy_gpu_usage_info(job_info.summary, sinfo); @@ -462,8 +467,6 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() { copy_gpu_usage_info(job_info.gpus[i], ginfo); } - reply->set_status(result); - return ::grpc::Status::OK; } @@ -478,6 +481,8 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, target->set_end_time(src.end_time); target->set_energy_consumed(src.energy_consumed); target->set_max_gpu_memory_used(src.max_gpu_memory_used); + target->set_ecc_correct(src.ecc_correct); + target->set_ecc_uncorrect(src.ecc_uncorrect); 
::rdc::JobStatsSummary* stats = target->mutable_power_usage(); stats->set_max_value(src.power_usage.max_value); @@ -499,6 +504,25 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, stats->set_min_value(src.memory_utilization.min_value); stats->set_average(src.memory_utilization.average); + stats = target->mutable_pcie_tx(); + stats->set_max_value(src.pcie_tx.max_value); + stats->set_min_value(src.pcie_tx.min_value); + stats->set_average(src.pcie_tx.average); + + stats = target->mutable_pcie_rx(); + stats->set_max_value(src.pcie_rx.max_value); + stats->set_min_value(src.pcie_rx.min_value); + stats->set_average(src.pcie_rx.average); + + stats = target->mutable_memory_clock(); + stats->set_max_value(src.memory_clock.max_value); + stats->set_min_value(src.memory_clock.min_value); + stats->set_average(src.memory_clock.average); + + stats = target->mutable_gpu_temperature(); + stats->set_max_value(src.gpu_temperature.max_value); + stats->set_min_value(src.gpu_temperature.min_value); + stats->set_average(src.gpu_temperature.average); return true; }