Support extra metrics in the RDC

Remove the * in the rdci stats
When a group is created, the GPUs can be added in the same command.
Add support for the memory temperature.
Add support for the memory clock.
Add support for reporting the ECC errors.
Add support for reporting the PCIe bandwidth throughput.

Since the RX/TX throughput may take 1 second to retrieve, an async fetch is implemented
in the RdcMetricFetcherImpl.

Change-Id: If04f602fe1f2d14dbf7c2fb189549fd030523f9a
This commit is contained in:
Bill(Shuzhou) Liu
2020-04-29 10:32:50 -04:00
zatwierdzone przez Chris Freehill
rodzic 096dc2dadb
commit f4a3fd4dda
20 zmienionych plików z 601 dodań i 62 usunięć
+38
Wyświetl plik
@@ -144,16 +144,48 @@ typedef enum {
*/
#define RDC_FI_GPU_SM_CLOCK 100
/**
* Clock for the memory
*/
#define RDC_FI_MEM_CLOCK 101
/**
* PCIe Tx utilization information
*/
#define RDC_FI_PCIE_TX 200
/**
* PCIe Rx utilization information
*/
#define RDC_FI_PCIE_RX 201
/**
* GPU Utilization
*/
#define RDC_FI_GPU_UTIL 203
/**
* Accumulated correctable ECC errors
*/
#define RDC_FI_ECC_CORRECT_TOTAL 312
/**
* Accumulated uncorrectable ECC errors
*/
#define RDC_FI_ECC_UNCORRECT_TOTAL 313
/**
* Memory temperature for the device
*/
#define RDC_FI_MEMORY_TEMP 140
/**
* Current temperature for the device
*/
#define RDC_FI_GPU_TEMP 150
/**
* GPU count in the system
*/
@@ -209,9 +241,15 @@ typedef struct {
uint64_t end_time; //!< The time to stop the watching
uint64_t energy_consumed;
uint64_t ecc_correct;
uint64_t ecc_uncorrect;
rdc_stats_summary_t pcie_tx;
rdc_stats_summary_t pcie_rx;
rdc_stats_summary_t power_usage;
rdc_stats_summary_t gpu_clock;
rdc_stats_summary_t memory_clock;
rdc_stats_summary_t gpu_utilization;
rdc_stats_summary_t gpu_temperature;
uint64_t max_gpu_memory_used;
rdc_stats_summary_t memory_utilization;
+5 -6
Wyświetl plik
@@ -32,7 +32,6 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
typedef std::map<uint32_t, uint64_t> rdc_gpu_total_memory_t;
class RdcCacheManager {
public:
@@ -48,12 +47,14 @@ class RdcCacheManager {
virtual std::string get_cache_stats() = 0;
virtual rdc_status_t rdc_job_get_stats(char jobId[64],
const rdc_gpu_total_memory_t& total_memory,
const rdc_gpu_gauges_t& gpu_gauges,
rdc_job_info_t* p_job_info) = 0;
virtual rdc_status_t rdc_job_start_stats(char jobId[64],
const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo) = 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0;
const rdc_field_group_info_t& finfo,
const rdc_gpu_gauges_t& gpu_gauges) = 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index,
const std::string& job_id, const rdc_field_value& value) = 0;
virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0;
@@ -64,8 +65,6 @@ class RdcCacheManager {
typedef std::shared_ptr<RdcCacheManager> RdcCacheManagerPtr;
//<! The key to identify the field with <gpu_id, field_id>
typedef std::pair<uint32_t, uint32_t> RdcFieldKey;
} // namespace rdc
} // namespace amd
+4 -2
Wyświetl plik
@@ -36,8 +36,10 @@ class RdcWatchTable {
virtual rdc_status_t rdc_field_update_all() = 0;
virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id,
char job_id[64], uint64_t update_freq) = 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64]) = 0;
char job_id[64], uint64_t update_freq,
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_job_stop_stats(char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_job_remove(char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
@@ -53,6 +53,8 @@ struct FieldSummaryStats {
// Per-GPU accumulators kept for one job while its statistics are collected.
struct GpuSummaryStats {
// Energy figure reported as the GPU's energy_consumed in the job statistics.
uint64_t energy_consumed;
uint64_t energy_last_time;
// ECC counters. They are seeded from the gauge values when the job starts
// (0 when no gauge is available). When the job is stopped they are
// overwritten with (gauge at stop - value at start), so for a stopped job
// these fields hold the number of errors seen during the job rather than the
// initial counter.
uint64_t ecc_correct_init; // Init counter when job starts
uint64_t ecc_uncorrect_init; // Init counter when job starts
// Running summary for each watched field, keyed by field id.
std::map<uint32_t, FieldSummaryStats> field_summaries;
};
@@ -80,12 +82,14 @@ class RdcCacheManagerImpl: public RdcCacheManager {
std::string get_cache_stats() override;
rdc_status_t rdc_job_get_stats(char job_id[64],
const rdc_gpu_total_memory_t& total_memory,
const rdc_gpu_gauges_t& gpu_gauges,
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_start_stats(char job_id[64],
const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
const rdc_field_group_info_t& finfo,
const rdc_gpu_gauges_t& gpu_gauges) override;
rdc_status_t rdc_job_stop_stats(char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_update_job_stats(uint32_t gpu_index,
const std::string& job_id,
const rdc_field_value& value) override;
@@ -92,6 +92,7 @@ class RdcEmbeddedHandler: public RdcHandler {
~RdcEmbeddedHandler();
private:
rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges);
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcMetricFetcherPtr metric_fetcher_;
@@ -22,16 +22,55 @@ THE SOFTWARE.
#ifndef RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
#define RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
#include <mutex> // NOLINT(build/c++11)
#include <future> // NOLINT(build/c++11)
#include <condition_variable> // NOLINT(build/c++11)
#include <map>
#include <queue>
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
//!< Some metrics, like PCIe throughput, may take a second to retrieve. The
//!< MetricValue caches those metrics so they can be retrieved asynchronously.
struct MetricValue {
// How long the cached value stays fresh, in milliseconds.
uint64_t cache_ttl;
// Time of the last refresh attempt, in milliseconds (same clock as now()).
uint64_t last_time;
// The cached field value, including the status of the last fetch.
rdc_field_value value;
};
//!< The data structure to store the async fetch task
class RdcMetricFetcherImpl;
struct MetricTask {
// <gpu_index, field_id> the task has to refresh; passed to the task as-is.
RdcFieldKey field;
// Work item run by the fetcher's background updater. It is invoked with the
// fetcher itself and the field key above.
std::function<void(RdcMetricFetcherImpl&, RdcFieldKey)> task;
};
/**
 * @brief Metric fetcher that reads the GPU metrics and caches the slow ones.
 *
 * Fast metrics are read synchronously in fetch_smi_field(). Slow metrics
 * (the PCIe throughput) are refreshed by a background updater and served
 * from async_metrics_.
 */
class RdcMetricFetcherImpl: public RdcMetricFetcher {
 public:
    //! Fetch one field of one GPU into @p value.
    rdc_status_t fetch_smi_field(uint32_t gpu_index,
        uint32_t field_id, rdc_field_value* value) override;

    //! Return true when @p field_id is one of the fields this fetcher knows.
    bool is_field_valid(uint32_t field_id) const override;

    //! Start the background updater.
    RdcMetricFetcherImpl();

    //! Signal the background updater to stop.
    ~RdcMetricFetcherImpl();

 private:
    //! Current wall-clock time in milliseconds.
    uint64_t now();

    //! Sum the ECC error counters over the GPU blocks and store the total
    //! selected by @p field_id into @p value.
    void get_ecc_error(uint32_t gpu_index,
        uint32_t field_id, rdc_field_value* value);

    //! Serve the PCIe throughput from the cache, or queue a refresh when the
    //! cache has no fresh entry.
    void async_get_pcie_throughput(uint32_t gpu_index,
        uint32_t field_id, rdc_field_value* value);

    //! Read the PCIe throughput of the GPU in @p key and update the cache.
    void get_pcie_throughput(const RdcFieldKey& key);

    //! Cache of the asynchronously retrieved metrics.
    std::map<RdcFieldKey, MetricValue> async_metrics_;
    //! Pending refresh tasks consumed by the background updater.
    std::queue<MetricTask> updated_tasks_;
    //! Guards async_metrics_ and updated_tasks_.
    std::mutex task_mutex_;
    //! Wakes the updater when a task is queued or a stop is requested.
    std::condition_variable cv_;
    //! True while the updater should keep running.
    std::atomic<bool> task_started_;
    //! Future of the updater. Keep it as the LAST member: members are
    //! destroyed in reverse declaration order and the destructor of a future
    //! returned by std::async blocks until the task ends, so the updater is
    //! joined before the mutex, the condition variable and the flag it uses
    //! are destroyed.
    std::future<void> updater_;
};
} // namespace rdc
@@ -54,8 +54,10 @@ struct JobWatchTableEntry {
class RdcWatchTableImpl : public RdcWatchTable {
public:
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id,
char job_id[64], uint64_t update_freq) override;
rdc_status_t rdc_job_stop_stats(char job_id[64]) override;
char job_id[64], uint64_t update_freq,
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_job_stop_stats(char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_job_remove(char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
+8
Wyświetl plik
@@ -23,6 +23,8 @@ THE SOFTWARE.
#ifndef RDC_LIB_RDC_COMMON_H_
#define RDC_LIB_RDC_COMMON_H_
#include <iostream>
#include <map>
#include <utility>
#define RDC_ERROR 0
#define RDC_INFO 1
@@ -37,6 +39,12 @@ THE SOFTWARE.
} \
} while (0)
//<! The key to identify the field with <gpu_id, field_id>
typedef std::pair<uint32_t, uint32_t> RdcFieldKey;
//!< The gauge metrics do not require aggregations
typedef std::map<RdcFieldKey, uint64_t> rdc_gpu_gauges_t;
/**
* @brief The strncpy but with null terminated
*
+6
Wyświetl plik
@@ -427,6 +427,12 @@ message GpuUsageInfo {
JobStatsSummary gpu_utilization = 7;
uint64 max_gpu_memory_used = 8;
JobStatsSummary memory_utilization = 9;
uint64 ecc_correct = 10;
uint64 ecc_uncorrect = 11;
JobStatsSummary pcie_tx = 12;
JobStatsSummary pcie_rx = 13;
JobStatsSummary memory_clock = 14;
JobStatsSummary gpu_temperature = 15;
}
message GetJobStatsResponse {
uint32 status = 1;
@@ -358,6 +358,12 @@ const char* field_id_string(uint32_t field_id) {
{RDC_FI_GPU_UTIL, "GPU_UTIL"},
{RDC_FI_GPU_TEMP, "GPU_TEMP"},
{RDC_FI_GPU_COUNT, "GPU_COUNT"},
{RDC_FI_MEM_CLOCK, "MEM_CLOCK"},
{RDC_FI_PCIE_TX, "PCIE_TX"},
{RDC_FI_PCIE_RX, "PCIE_RX"},
{RDC_FI_ECC_CORRECT_TOTAL, "ECC_CORRECT"},
{RDC_FI_ECC_UNCORRECT_TOTAL, "ECC_UNCORRECT"},
{RDC_FI_MEMORY_TEMP, "MEMORY_TEMP"},
{RDC_FI_DEV_NAME, "DEV_NAME"}
};
+103 -5
Wyświetl plik
@@ -235,7 +235,11 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index,
void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats,
rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary,
unsigned int adjuster) {
if (stats.count == 0) return;
if (stats.count == 0) {
gpu.min_value = std::numeric_limits<uint64_t>::max();
gpu.max_value = gpu.average = 0;
return;
}
gpu.max_value = stats.max_value / adjuster;
gpu.min_value = stats.min_value / adjuster;
@@ -247,7 +251,7 @@ void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats,
}
rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64],
const rdc_gpu_total_memory_t& total_memory,
const rdc_gpu_gauges_t& gpu_gauges,
rdc_job_info_t* p_job_info) {
std::lock_guard<std::mutex> guard(cache_mutex_);
auto job_stats = cache_jobs_.find(jobId);
@@ -257,6 +261,7 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64],
}
//< Init the summary info
bool is_job_stopped = (job_stats->second.end_time != 0);
RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " << jobId);
auto& summary_info = p_job_info->summary;
summary_info.start_time = job_stats->second.start_time;
@@ -267,7 +272,13 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64],
}
summary_info.energy_consumed = 0;
summary_info.max_gpu_memory_used = 0;
summary_info.ecc_correct = 0;
summary_info.ecc_uncorrect = 0;
summary_info.power_usage = {0, std::numeric_limits<uint64_t>::max(), 0};
summary_info.pcie_tx = {0, std::numeric_limits<uint64_t>::max(), 0};
summary_info.pcie_rx = {0, std::numeric_limits<uint64_t>::max(), 0};
summary_info.gpu_temperature = {0, std::numeric_limits<uint64_t>::max(), 0};
summary_info.memory_clock = {0, std::numeric_limits<uint64_t>::max(), 0};
summary_info.gpu_clock = {0, std::numeric_limits<uint64_t>::max(), 0};
summary_info.gpu_utilization = {0, std::numeric_limits<uint64_t>::max(), 0};
summary_info.memory_utilization = {0,
@@ -285,13 +296,46 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64],
gpu_info.energy_consumed = gpus->second.energy_consumed;
summary_info.energy_consumed += gpu_info.energy_consumed;
if (is_job_stopped) {
gpu_info.ecc_correct = gpus->second.ecc_correct_init;
summary_info.ecc_correct += gpu_info.ecc_correct;
} else if (gpu_gauges.find({gpus->first,
RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) {
gpu_info.ecc_correct = gpu_gauges.at({
gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) -
gpus->second.ecc_correct_init;
summary_info.ecc_correct += gpu_info.ecc_correct;
} else {
gpu_info.ecc_correct = 0;
}
if (is_job_stopped) {
gpu_info.ecc_uncorrect = gpus->second.ecc_uncorrect_init;
summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect;
} else if (gpu_gauges.find({gpus->first,
RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) {
gpu_info.ecc_uncorrect = gpu_gauges.at({
gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) -
gpus->second.ecc_uncorrect_init;
summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect;
} else {
gpu_info.ecc_uncorrect = 0;
}
if (gpu_gauges.find({gpus->first,
RDC_FI_GPU_MEMORY_TOTAL}) == gpu_gauges.end()) {
RDC_LOG(RDC_ERROR, "Cannot find the total memory");
return RDC_ST_BAD_PARAMETER;
}
uint64_t tmemory = gpu_gauges.at({gpus->first,
RDC_FI_GPU_MEMORY_TOTAL});
auto ite = gpus->second.field_summaries.begin();
for (; ite != gpus->second.field_summaries.end(); ite++) {
if (ite->first == RDC_FI_POWER_USAGE) {
set_summary(ite->second,
gpu_info.power_usage, summary_info.power_usage, 1000000);
} else if (ite->first == RDC_FI_GPU_MEMORY_USAGE) {
auto tmemory = total_memory.at(gpus->first);
set_summary(ite->second, gpu_info.memory_utilization,
summary_info.memory_utilization, tmemory/100);
gpu_info.max_gpu_memory_used = ite->second.max_value;
@@ -304,6 +348,18 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64],
} else if (ite->first == RDC_FI_GPU_UTIL) {
set_summary(ite->second, gpu_info.gpu_utilization,
summary_info.gpu_utilization, 1);
} else if (ite->first == RDC_FI_GPU_TEMP) {
set_summary(ite->second,
gpu_info.gpu_temperature, summary_info.gpu_temperature, 1000);
} else if (ite->first == RDC_FI_MEM_CLOCK) {
set_summary(ite->second,
gpu_info.memory_clock, summary_info.memory_clock, 1000000);
} else if (ite->first == RDC_FI_PCIE_TX) {
set_summary(ite->second,
gpu_info.pcie_tx, summary_info.pcie_tx, 1024*1024);
} else if (ite->first == RDC_FI_PCIE_RX) {
set_summary(ite->second,
gpu_info.pcie_rx, summary_info.pcie_rx, 1024*1024);
}
}
}
@@ -316,12 +372,21 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(char jobId[64],
p_job_info->num_gpus;
summary_info.memory_utilization.average =
summary_info.memory_utilization.average/p_job_info->num_gpus;
summary_info.pcie_tx.average = summary_info.pcie_tx.average/
p_job_info->num_gpus;
summary_info.pcie_rx.average = summary_info.pcie_rx.average/
p_job_info->num_gpus;
summary_info.gpu_temperature.average = summary_info.gpu_temperature.average/
p_job_info->num_gpus;
summary_info.memory_clock.average = summary_info.memory_clock.average/
p_job_info->num_gpus;
return RDC_ST_OK;
}
rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64],
const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo) {
const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo,
const rdc_gpu_gauges_t& gpu_gauges) {
RdcJobStatsCacheEntry cacheEntry;
cacheEntry.start_time = std::time(nullptr);
cacheEntry.end_time = 0;
@@ -336,6 +401,20 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64],
gstats.field_summaries.insert({finfo.field_ids[j], s});
}
gstats.ecc_correct_init = 0;
if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL}) !=
gpu_gauges.end()) {
gstats.ecc_correct_init = gpu_gauges.at(
{ginfo.entity_ids[i], RDC_FI_ECC_CORRECT_TOTAL});
}
gstats.ecc_uncorrect_init = 0;
if (gpu_gauges.find({ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL}) !=
gpu_gauges.end()) {
gstats.ecc_uncorrect_init = gpu_gauges.at(
{ginfo.entity_ids[i], RDC_FI_ECC_UNCORRECT_TOTAL});
}
cacheEntry.gpu_stats.insert({ginfo.entity_ids[i], gstats});
}
@@ -347,7 +426,8 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(char job_id[64],
}
rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) {
rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64],
const rdc_gpu_gauges_t& gpu_gauges) {
std::lock_guard<std::mutex> guard(cache_mutex_);
auto job_stats = cache_jobs_.find(job_id);
@@ -357,6 +437,24 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(char job_id[64]) {
job_stats->second.end_time = std::time(nullptr);
// update the ecc errors
auto gpus = job_stats->second.gpu_stats.begin();
for (; gpus != job_stats->second.gpu_stats.end(); gpus++) {
if (gpu_gauges.find({gpus->first,
RDC_FI_ECC_CORRECT_TOTAL}) != gpu_gauges.end()) {
gpus->second.ecc_correct_init = gpu_gauges.at({
gpus->first, RDC_FI_ECC_CORRECT_TOTAL}) -
gpus->second.ecc_correct_init;
}
if (gpu_gauges.find({gpus->first,
RDC_FI_ECC_UNCORRECT_TOTAL}) != gpu_gauges.end()) {
gpus->second.ecc_uncorrect_init = gpu_gauges.at({
gpus->first, RDC_FI_ECC_UNCORRECT_TOTAL}) -
gpus->second.ecc_uncorrect_init;
}
}
return RDC_ST_OK;
}
+47 -10
Wyświetl plik
@@ -86,38 +86,75 @@ RdcEmbeddedHandler::~RdcEmbeddedHandler() {
// JOB API
rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId,
char job_id[64], uint64_t update_freq) {
return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq);
rdc_gpu_gauges_t gpu_gauges;
rdc_status_t status = get_gpu_gauges(&gpu_gauges);
if (status != RDC_ST_OK) return status;
return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq,
gpu_gauges);
}
rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64],
rdc_job_info_t* p_job_info) {
rdc_status_t RdcEmbeddedHandler::get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges) {
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
uint32_t count = 0;
if (gpu_gauges == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
rdc_status_t status = rdc_device_get_all(
gpu_index_list, &count);
if (status != RDC_ST_OK) {
return status;
}
rdc_gpu_total_memory_t all_total_memory;
// Fetch total memory and current ecc errors
for (uint32_t i = 0; i < count ; i++) {
rdc_field_value total_memory;
rdc_field_value value;
status = metric_fetcher_->fetch_smi_field(gpu_index_list[i],
RDC_FI_GPU_MEMORY_TOTAL, &total_memory);
RDC_FI_GPU_MEMORY_TOTAL, &value);
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU "
<< gpu_index_list[i]);
return status;
}
all_total_memory.insert({gpu_index_list[i], total_memory.value.l_int});
gpu_gauges->insert({{gpu_index_list[i], RDC_FI_GPU_MEMORY_TOTAL},
value.value.l_int});
status = metric_fetcher_->fetch_smi_field(gpu_index_list[i],
RDC_FI_ECC_CORRECT_TOTAL, &value);
if (status == RDC_ST_OK) {
gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_CORRECT_TOTAL},
value.value.l_int});
}
status = metric_fetcher_->fetch_smi_field(gpu_index_list[i],
RDC_FI_ECC_UNCORRECT_TOTAL, &value);
if (status == RDC_ST_OK) {
gpu_gauges->insert({{gpu_index_list[i], RDC_FI_ECC_UNCORRECT_TOTAL},
value.value.l_int});
}
}
return RDC_ST_OK;
}
rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(char job_id[64],
rdc_job_info_t* p_job_info) {
if (p_job_info == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
return cache_mgr_->rdc_job_get_stats(job_id, all_total_memory, p_job_info);
rdc_gpu_gauges_t gpu_gauges;
rdc_status_t status = get_gpu_gauges(&gpu_gauges);
if (status != RDC_ST_OK) return status;
return cache_mgr_->rdc_job_get_stats(job_id, gpu_gauges, p_job_info);
}
rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(char job_id[64]) {
return watch_table_->rdc_job_stop_stats(job_id);
rdc_gpu_gauges_t gpu_gauges;
rdc_status_t status = get_gpu_gauges(&gpu_gauges);
if (status != RDC_ST_OK) return status;
return watch_table_->rdc_job_stop_stats(job_id, gpu_gauges);
}
rdc_status_t RdcEmbeddedHandler::rdc_job_remove(char job_id[64]) {
@@ -30,7 +30,9 @@ namespace rdc {
RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
// Add the default job stats fields
uint32_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE,
RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL};
RDC_FI_POWER_USAGE, RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL,
RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, RDC_FI_MEM_CLOCK,
RDC_FI_GPU_TEMP};
char job_field_group[] = "JobStatsFields";
rdc_field_grp_t fgid = JOB_FIELD_ID;
+198 -9
Wyświetl plik
@@ -35,18 +35,192 @@ namespace rdc {
bool RdcMetricFetcherImpl::is_field_valid(uint32_t field_id) const {
const std::vector<uint32_t> all_fields = {RDC_FI_GPU_MEMORY_USAGE,
RDC_FI_GPU_MEMORY_TOTAL, RDC_FI_GPU_COUNT, RDC_FI_POWER_USAGE,
RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME, RDC_FI_GPU_TEMP};
RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME, RDC_FI_GPU_TEMP,
RDC_FI_MEM_CLOCK, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX,
RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, RDC_FI_MEMORY_TEMP};
return std::find(all_fields.begin(), all_fields.end(), field_id)
!= all_fields.end();
}
/**
 * @brief Start the background updater that runs the queued refresh tasks.
 *
 * The updater sleeps on cv_ until a task is queued or a stop is requested,
 * takes one task at a time from updated_tasks_ and runs it without holding
 * task_mutex_.
 */
RdcMetricFetcherImpl::RdcMetricFetcherImpl() {
    task_started_ = true;

    // The worker runs on its own thread for the life time of the fetcher.
    updater_ = std::async(std::launch::async, [this]() {
        while (task_started_) {
            MetricTask pending;
            bool has_task = false;

            {   // Hold the lock only while looking at the queue.
                std::unique_lock<std::mutex> queue_lock(task_mutex_);
                cv_.wait(queue_lock, [this] {
                    return !task_started_ || !updated_tasks_.empty();
                });

                if (!updated_tasks_.empty()) {
                    pending = updated_tasks_.front();
                    updated_tasks_.pop();
                    has_task = true;
                }
            }

            // The task may take a long time, so it runs without the lock.
            if (has_task) {
                pending.task(*this, pending.field);
            }
        }
    });
}
/**
 * @brief Stop the background updater and wait until it has finished.
 */
RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
    {
        // Flip the flag while holding the mutex the updater waits with.
        // Otherwise the notification can arrive after the updater evaluated
        // its wait predicate but before it blocks, the wake-up is lost and
        // the updater (and the future joining it) never returns.
        std::lock_guard<std::mutex> guard(task_mutex_);
        task_started_ = false;
    }
    cv_.notify_all();

    // Join the updater here, while every member it uses is still alive.
    // It may take as long as the task that is currently running.
    if (updater_.valid()) {
        updater_.wait();
    }
}
/**
 * @brief Current wall-clock time.
 *
 * @return The time reported by gettimeofday(), in milliseconds.
 */
uint64_t RdcMetricFetcherImpl::now() {
    struct timeval current;
    gettimeofday(&current, NULL);

    const uint64_t seconds_as_ms = static_cast<uint64_t>(current.tv_sec) * 1000;
    return seconds_as_ms + current.tv_usec / 1000;
}
void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index,
uint32_t field_id, rdc_field_value* value) {
rsmi_status_t err = RSMI_STATUS_SUCCESS;
uint64_t correctable_err = 0;
uint64_t uncorrectable_err = 0;
rsmi_ras_err_state_t err_state;
if (!value) {
return;
}
for (uint32_t b = RSMI_GPU_BLOCK_FIRST;
b <= RSMI_GPU_BLOCK_LAST; b = b*2) {
err = rsmi_dev_ecc_status_get(gpu_index, static_cast<rsmi_gpu_block_t>(b),
&err_state);
if (err != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Get the ecc Status error " << b
<< ":" << err);
continue;
}
rsmi_error_count_t ec;
err = rsmi_dev_ecc_count_get(gpu_index,
static_cast<rsmi_gpu_block_t>(b), &ec);
if (err == RSMI_STATUS_SUCCESS) {
correctable_err += ec.correctable_err;
uncorrectable_err += ec.uncorrectable_err;
}
}
value->status = RSMI_STATUS_SUCCESS;
value->type = INTEGER;
if (field_id == RDC_FI_ECC_CORRECT_TOTAL) {
value->value.l_int = correctable_err;
}
if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) {
value->value.l_int = uncorrectable_err;
}
}
void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
uint32_t field_id, rdc_field_value* value) {
if (!value) {
return;
}
do {
std::lock_guard<std::mutex> guard(task_mutex_);
auto metric = async_metrics_.find({gpu_index, field_id});
if ( metric != async_metrics_.end() ) {
if (now() < metric->second.last_time + metric->second.cache_ttl) {
RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
field_id_string(field_id) << " from cache");
value->status = metric->second.value.status;
value->type = metric->second.value.type;
value->value = metric->second.value.value;
return;
}
}
// add to the async task queue
MetricTask t;
t.field = {gpu_index, field_id};
t.task = &RdcMetricFetcherImpl::get_pcie_throughput;
updated_tasks_.push(t);
RDC_LOG(RDC_DEBUG, "Start async fetch " << gpu_index << ":" <<
field_id_string(field_id) << " to cache.");
} while (0);
cv_.notify_all();
}
void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
uint32_t gpu_index = key.first;
uint64_t sent, received, max_pkt_sz;
rsmi_status_t ret;
// Return if the cache does not expire yet
do {
std::lock_guard<std::mutex> guard(task_mutex_);
auto metric = async_metrics_.find(key);
if (metric != async_metrics_.end() &&
now() < metric->second.last_time + metric->second.cache_ttl) {
return;
}
} while (0);
ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz);
uint64_t curTime = now();
MetricValue value;
value.cache_ttl = 30*1000; // cache 30 seconds
value.value.type = INTEGER;
do {
std::lock_guard<std::mutex> guard(task_mutex_);
// Create new cache entry it does not exist
auto tx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_TX});
if (tx_metric == async_metrics_.end()) {
tx_metric = async_metrics_.insert(
{{gpu_index, RDC_FI_PCIE_TX}, value}).first;
tx_metric->second.value.field_id = RDC_FI_PCIE_TX;
}
auto rx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_RX});
if (rx_metric == async_metrics_.end()) {
rx_metric = async_metrics_.insert(
{{gpu_index, RDC_FI_PCIE_RX}, value}).first;
rx_metric->second.value.field_id = RDC_FI_PCIE_RX;
}
// Always update the status and last_time
tx_metric->second.last_time = curTime;
tx_metric->second.value.status = ret;
tx_metric->second.value.ts = curTime;
rx_metric->second.last_time = curTime;
rx_metric->second.value.status = ret;
rx_metric->second.value.ts = curTime;
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
RDC_LOG(RDC_ERROR,
"PCIe throughput not supported on GPU " << gpu_index);
return;
}
if (ret == RSMI_STATUS_SUCCESS) {
rx_metric->second.value.value.l_int = received;
tx_metric->second.value.value.l_int = sent;
RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":" <<
"RDC_FI_PCIE_RX and RDC_FI_PCIE_TX to cache.");
}
} while (0);
}
rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
uint32_t field_id, rdc_field_value* value) {
if (!value) {
return RDC_ST_BAD_PARAMETER;
}
uint64_t i64 = 0;
rsmi_temperature_type_t sensor_type;
rsmi_clk_type_t clk_type;
if (!is_field_valid(field_id)) {
RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id
@@ -54,9 +228,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
return RDC_ST_NOT_SUPPORTED;
}
struct timeval tv;
gettimeofday(&tv, NULL);
value->ts = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
value->ts = now();
value->field_id = field_id;
value->status = RSMI_STATUS_NOT_SUPPORTED;
@@ -94,9 +266,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
}
break;
case RDC_FI_GPU_SM_CLOCK:
case RDC_FI_MEM_CLOCK:
rsmi_frequencies_t f;
clk_type = RSMI_CLK_TYPE_SYS;
if (field_id == RDC_FI_MEM_CLOCK) {
clk_type = RSMI_CLK_TYPE_MEM;
}
value->status = rsmi_dev_gpu_clk_freq_get(gpu_index,
RSMI_CLK_TYPE_SYS, &f);
clk_type, &f);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = f.frequency[f.current];
@@ -116,21 +293,33 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
value->type = STRING;
break;
case RDC_FI_GPU_TEMP:
case RDC_FI_MEMORY_TEMP:
int64_t val_i64;
sensor_type = RSMI_TEMP_TYPE_EDGE;
if (field_id == RDC_FI_MEMORY_TEMP) {
sensor_type = RSMI_TEMP_TYPE_MEMORY;
}
value->status = rsmi_dev_temp_metric_get(gpu_index,
0, RSMI_TEMP_CURRENT, &val_i64);
sensor_type , RSMI_TEMP_CURRENT, &val_i64);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = val_i64;
}
break;
case RDC_FI_ECC_CORRECT_TOTAL:
case RDC_FI_ECC_UNCORRECT_TOTAL:
get_ecc_error(gpu_index, field_id, value);
break;
case RDC_FI_PCIE_TX:
case RDC_FI_PCIE_RX:
async_get_pcie_throughput(gpu_index, field_id, value);
break;
default:
break;
}
gettimeofday(&tv, NULL);
int64_t latency = static_cast<uint64_t>(tv.tv_sec)*1000+tv.tv_usec/1000
- value->ts;
int64_t latency = now()-value->ts;
if (value->status != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
field_id_string(field_id) << " with rsmi error code "
+21 -11
Wyświetl plik
@@ -41,7 +41,8 @@ RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
}
rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id,
char job_id[64], uint64_t update_freq) {
char job_id[64], uint64_t update_freq,
const rdc_gpu_gauges_t& gpu_gauges) {
do { //< lock guard for thread safe
std::lock_guard<std::mutex> guard(watch_mutex_);
if (job_watch_table_.find(job_id) != job_watch_table_.end()) {
@@ -67,10 +68,6 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id,
job_watch_table_.insert({job_id, jentry});
} while (0);
result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0);
if (result != RDC_ST_OK) {
return result;
}
rdc_field_group_info_t finfo;
rdc_group_info_t ginfo;
@@ -85,12 +82,18 @@ rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id,
return result;
}
result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo);
result = cache_mgr_->rdc_job_start_stats(job_id, ginfo, finfo, gpu_gauges);
if (result != RDC_ST_OK) {
return result;
}
// At last, when every thing sets up, starts to watch the fields.
result = rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0);
return result;
}
rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) {
rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) {
uint32_t job_group_id;
do { //< lock guard for thread safe
std::lock_guard<std::mutex> guard(watch_mutex_);
@@ -111,13 +114,14 @@ rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(char job_id[64]) {
job_watch_table_.erase(job_id);
} while (0);
result = cache_mgr_->rdc_job_stop_stats(job_id);
result = cache_mgr_->rdc_job_stop_stats(job_id, gpu_gauge);
return result;
}
rdc_status_t RdcWatchTableImpl::rdc_job_remove(char job_id[64]) {
rdc_job_stop_stats(job_id);
rdc_gpu_gauges_t gpu_gauge;
rdc_job_stop_stats(job_id, gpu_gauge);
return cache_mgr_->rdc_job_remove(job_id);
}
@@ -134,7 +138,8 @@ rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() {
// Stop them
for (auto job = v.begin(); job != v.end(); job++) {
rdc_job_stop_stats(const_cast<char*>(job->c_str()));
rdc_gpu_gauges_t gpu_gauge;
rdc_job_stop_stats(const_cast<char*>(job->c_str()), gpu_gauge);
}
return cache_mgr_->rdc_job_remove_all();
@@ -340,8 +345,9 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() {
auto fite = fields_to_watch_.begin();
for (; fite != fields_to_watch_.end(); fite++) {
// Is this field need to be updated?
uint64_t track_freq = fite->second.update_freq/1000;
if (!fite->second.is_watching ||
fite->second.last_update_time+fite->second.update_freq/1000 > now) {
fite->second.last_update_time+track_freq > now) {
continue;
}
@@ -350,6 +356,10 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() {
result = metric_fetcher_->fetch_smi_field(
fite->first.first, fite->first.second, &value);
if (result != RDC_ST_OK) {
// To prevent frequently retry when error, update the time
gettimeofday(&tv, NULL);
now = static_cast<uint64_t>(tv.tv_sec)*1000+tv.tv_usec/1000;
fite->second.last_update_time = now;
continue;
}
@@ -88,6 +88,8 @@ bool RdcStandaloneHandler::copy_gpu_usage_info(
target->end_time = src.end_time();
target->energy_consumed = src.energy_consumed();
target->max_gpu_memory_used = src.max_gpu_memory_used();
target->ecc_correct = src.ecc_correct();
target->ecc_uncorrect = src.ecc_uncorrect();
const ::rdc::JobStatsSummary& pstats = src.power_usage();
target->power_usage.max_value = pstats.max_value();
@@ -109,6 +111,26 @@ bool RdcStandaloneHandler::copy_gpu_usage_info(
target->memory_utilization.min_value = mstats.min_value();
target->memory_utilization.average = mstats.average();
const ::rdc::JobStatsSummary& txstats = src.pcie_tx();
target->pcie_tx.max_value = txstats.max_value();
target->pcie_tx.min_value = txstats.min_value();
target->pcie_tx.average = txstats.average();
const ::rdc::JobStatsSummary& rxstats = src.pcie_rx();
target->pcie_rx.max_value = rxstats.max_value();
target->pcie_rx.min_value = rxstats.min_value();
target->pcie_rx.average = rxstats.average();
const ::rdc::JobStatsSummary& mcstats = src.memory_clock();
target->memory_clock.max_value = mcstats.max_value();
target->memory_clock.min_value = mcstats.min_value();
target->memory_clock.average = mcstats.average();
const ::rdc::JobStatsSummary& gtstats = src.gpu_temperature();
target->gpu_temperature.max_value = gtstats.max_value();
target->gpu_temperature.min_value = gtstats.min_value();
target->gpu_temperature.average = gtstats.average();
return true;
}
rdc_status_t RdcStandaloneHandler::rdc_job_get_stats(char job_id[64],
+11 -2
Wyświetl plik
@@ -278,10 +278,19 @@ void RdciDmonSubSystem::create_temp_field_group() {
void RdciDmonSubSystem::show_field_usage() const {
std::cout << "Supported fields Ids:\n";
std::cout << "100 RDC_FI_GPU_SM_CLOCK: Current GPU clock frequencies.\n";
std::cout << "101 RDC_FI_MEM_CLOCK: Current Memory clock frequencies.\n";
std::cout << "140 RDC_FI_MEMORY_TEMP: Memory "
<< "temperature in millidegrees Celsius.\n";
std::cout << "150 RDC_FI_GPU_TEMP: GPU "
<< "temperature in millidegrees Celcius.\n";
<< "temperature in millidegrees Celsius.\n";
std::cout << "155 RDC_FI_POWER_USAGE: Power usage in microwatts.\n";
std::cout << "200 RDC_FI_PCIE_TX: PCIe Tx utilization in bytes/second.\n";
std::cout << "201 RDC_FI_PCIE_RX: PCIe Rx utilization in bytes/second.\n";
std::cout << "203 RDC_FI_GPU_UTIL: GPU busy percentage.\n";
std::cout << "312 RDC_FI_ECC_CORRECT_TOTAL: Accumulated "
<< "correctable ECC errors.\n";
std::cout << "313 RDC_FI_ECC_UNCORRECT_TOTAL: Accumulated "
<< "uncorrectable ECC errors.\n";
std::cout << "525 RDC_FI_GPU_MEMORY_USAGE: Memory usage of the GPU "
<< "instance in bytes.\n";
}
@@ -361,7 +370,7 @@ void RdciDmonSubSystem::process() {
group_info.entity_ids[gindex],
field_info.field_ids[findex], &value);
if (result != RDC_ST_OK) {
std::cout << std::left << std::setw(20) << "error";
std::cout << std::left << std::setw(20) << "N/A";
} else {
if (value.type == INTEGER) {
std::cout << std::left << std::setw(20)
+26 -3
Wyświetl plik
@@ -82,7 +82,10 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
group_name_ = optarg;
break;
case 'a':
group_ops_ = GROUP_ADD_GPUS;
// Create may add GPUs as well.
if (group_ops_ != GROUP_CREATE) {
group_ops_ = GROUP_ADD_GPUS;
}
gpu_ids_ = optarg;
break;
case 'i':
@@ -116,7 +119,8 @@ void RdciGroupSubSystem::show_help() const {
std::cout << " group -- Used to create and maintain groups of GPUs.\n\n";
std::cout << "Usage\n";
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -l\n";
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -c <groupName>\n";
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -c <groupName> "
<< "[-a <entityId>]\n";
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -g <groupId> "
<< "[-a <entityId>]\n";
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] "
@@ -157,6 +161,25 @@ void RdciGroupSubSystem::process() {
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY,
group_name_.c_str(), &group_id);
if (result != RDC_ST_OK) {
throw RdcException(result, "Fail to create group "
+ group_name_);
}
gpu_ids = split_string(gpu_ids_, ',');
for (uint32_t i = 0; i < gpu_ids.size(); i++) {
if (!IsNumber(gpu_ids[i])) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"The GPU Id "+gpu_ids[i]+" needs to be a number");
}
result = rdc_group_gpu_add(rdc_handle_,
group_id, std::stoi(gpu_ids[i]));
if (result != RDC_ST_OK) {
throw RdcException(result, "Fail to add GPU "
+ gpu_ids[i] + " to the group");
}
}
if (result == RDC_ST_OK) {
std::cout << "Successfully created group with a group ID "
<< group_id << std::endl;
@@ -214,7 +237,7 @@ void RdciGroupSubSystem::process() {
for (uint32_t i = 0; i < gpu_ids.size(); i++) {
if (!IsNumber(gpu_ids[i])) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"The GUP Id "+gpu_ids[i]+" needs to be a number");
"The GPU Id "+gpu_ids[i]+" needs to be a number");
}
result = rdc_group_gpu_add(rdc_handle_,
group_id_, std::stoi(gpu_ids[i]));
+26 -6
Wyświetl plik
@@ -150,11 +150,11 @@ void RdciStatsSubSystem::show_job_stats(
const rdc_gpu_usage_info_t& gpu_info) const {
std::cout << "|------- Execution Stats ----------"
<< "+------------------------------------\n";
std::cout << "| Start Time * | "
std::cout << "| Start Time | "
<< gpu_info.start_time << "\n";
std::cout << "| End Time * | "
std::cout << "| End Time | "
<< gpu_info.end_time << "\n";
std::cout << "| Total Execution Time (sec) * | "
std::cout << "| Total Execution Time (sec) | "
<< (gpu_info.end_time-gpu_info.start_time) << "\n";
std::cout << "+------- Performance Stats --------"
<< "+------------------------------------\n";
@@ -168,16 +168,36 @@ void RdciStatsSubSystem::show_job_stats(
<< gpu_info.gpu_clock.max_value << " Min: " <<
gpu_info.gpu_clock.min_value << " Avg: "
<< gpu_info.gpu_clock.average << "\n";
std::cout << "| Memory Clock (MHz) | " << "Max: "
<< gpu_info.memory_clock.max_value << " Min: " <<
gpu_info.memory_clock.min_value << " Avg: "
<< gpu_info.memory_clock.average << "\n";
std::cout << "| SM Utilization (%) | " << "Max: "
<< gpu_info.gpu_utilization.max_value <<" Min: " <<
gpu_info.gpu_utilization.min_value << " Avg: " <<
gpu_info.gpu_utilization.average << "\n";
std::cout << "| Max GPU Memory Used (bytes) * | " <<
std::cout << "| Max GPU Memory Used (bytes) | " <<
gpu_info.max_gpu_memory_used << "\n";
std::cout << "| Memory Utilization (%) | "
<< "Max: " << gpu_info.memory_utilization.max_value
<<" Min: "<< gpu_info.memory_utilization.min_value
<< " Avg: " << gpu_info.memory_utilization.average << "\n";
<<" Min: "<< gpu_info.memory_utilization.min_value
<< " Avg: " << gpu_info.memory_utilization.average << "\n";
std::cout << "| GPU Temperature (Celsius) | "
<< "Max: " << gpu_info.gpu_temperature.max_value
<<" Min: "<< gpu_info.gpu_temperature.min_value
<< " Avg: " << gpu_info.gpu_temperature.average << "\n";
std::cout << "| PCIe Rx Bandwidth (megabytes) | "
<< "Max: " << gpu_info.pcie_rx.max_value
<<" Min: "<< gpu_info.pcie_rx.min_value
<< " Avg: " << gpu_info.pcie_rx.average << "\n";
std::cout << "| PCIe Tx Bandwidth (megabytes) | "
<< "Max: " << gpu_info.pcie_tx.max_value
<<" Min: "<< gpu_info.pcie_tx.min_value
<< " Avg: " << gpu_info.pcie_tx.average << "\n";
std::cout << "| Correctable ECC Errors | "
<< gpu_info.ecc_correct << "\n";
std::cout << "| Uncorrectable ECC Errors | "
<< gpu_info.ecc_uncorrect << "\n";
std::cout << "+----------------------------------"
<< "+------------------------------------\n";
}
+26 -2
Wyświetl plik
@@ -453,6 +453,11 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() {
const_cast<char*>(request->job_id().c_str()),
&job_info);
reply->set_status(result);
if (result != RDC_ST_OK) {
return ::grpc::Status::OK;
}
reply->set_num_gpus(job_info.num_gpus);
::rdc::GpuUsageInfo* sinfo = reply->mutable_summary();
copy_gpu_usage_info(job_info.summary, sinfo);
@@ -462,8 +467,6 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() {
copy_gpu_usage_info(job_info.gpus[i], ginfo);
}
reply->set_status(result);
return ::grpc::Status::OK;
}
@@ -478,6 +481,8 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
target->set_end_time(src.end_time);
target->set_energy_consumed(src.energy_consumed);
target->set_max_gpu_memory_used(src.max_gpu_memory_used);
target->set_ecc_correct(src.ecc_correct);
target->set_ecc_uncorrect(src.ecc_uncorrect);
::rdc::JobStatsSummary* stats = target->mutable_power_usage();
stats->set_max_value(src.power_usage.max_value);
@@ -499,6 +504,25 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
stats->set_min_value(src.memory_utilization.min_value);
stats->set_average(src.memory_utilization.average);
stats = target->mutable_pcie_tx();
stats->set_max_value(src.pcie_tx.max_value);
stats->set_min_value(src.pcie_tx.min_value);
stats->set_average(src.pcie_tx.average);
stats = target->mutable_pcie_rx();
stats->set_max_value(src.pcie_rx.max_value);
stats->set_min_value(src.pcie_rx.min_value);
stats->set_average(src.pcie_rx.average);
stats = target->mutable_memory_clock();
stats->set_max_value(src.memory_clock.max_value);
stats->set_min_value(src.memory_clock.min_value);
stats->set_average(src.memory_clock.average);
stats = target->mutable_gpu_temperature();
stats->set_max_value(src.gpu_temperature.max_value);
stats->set_min_value(src.gpu_temperature.min_value);
stats->set_average(src.gpu_temperature.average);
return true;
}