Support standard deviation and json output for job stats
In addition to the max, min and average, the job stats
now also display the standard deviation.
A new option, --json, is added to rdci to output the results
in JSON format.
The job stats now use GMT time instead of a timestamp
for the start and end times.
Change-Id: If245c4fc4854a1dc867f97ff5aa9112af7962eca
[ROCm/rdc commit: e6d910f67a]
This commit is contained in:
committed by
Chris Freehill
parent
bf248131cb
commit
588ea96dd2
@@ -233,6 +233,7 @@ typedef struct {
|
||||
uint64_t max_value; //!< Maximum value measured
|
||||
uint64_t min_value; //!< Minimum value measured
|
||||
uint64_t average; //!< Average value measured
|
||||
double standard_deviation; //!< The standard deviation
|
||||
} rdc_stats_summary_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -46,6 +46,15 @@ struct FieldSummaryStats {
|
||||
int64_t max_value;
|
||||
int64_t min_value;
|
||||
int64_t total_value;
|
||||
|
||||
// Use Welford algorithm to calculate the standard deviations.
|
||||
// https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
|
||||
// https://www.johndcook.com/blog/standard_deviation/
|
||||
double old_m;
|
||||
double old_s;
|
||||
double new_m;
|
||||
double new_s;
|
||||
|
||||
uint64_t last_time;
|
||||
uint64_t count;
|
||||
};
|
||||
@@ -100,6 +109,8 @@ class RdcCacheManagerImpl: public RdcCacheManager {
|
||||
void set_summary(const FieldSummaryStats & stats,
|
||||
rdc_stats_summary_t& gpu, rdc_stats_summary_t& summary, // NOLINT
|
||||
unsigned int adjuster);
|
||||
void set_average_summary(
|
||||
rdc_stats_summary_t& summary, uint32_t num_gpus); // NOLINT
|
||||
RdcCacheSamples cache_samples_;
|
||||
RdcJobStatsCache cache_jobs_;
|
||||
std::mutex cache_mutex_;
|
||||
|
||||
@@ -414,7 +414,7 @@ message JobStatsSummary {
|
||||
uint64 max_value = 1;
|
||||
uint64 min_value = 2;
|
||||
uint64 average = 3;
|
||||
|
||||
double standard_deviation = 4;
|
||||
}
|
||||
|
||||
message GpuUsageInfo {
|
||||
|
||||
@@ -160,7 +160,7 @@ message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
|
||||
|
||||
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64")
|
||||
add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST})
|
||||
target_link_libraries(${RDC_LIB} pthread rocm_smi64)
|
||||
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64)
|
||||
target_include_directories(${RDC_LIB} PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
@@ -191,7 +191,7 @@ set(RDCCLIENT_LIB_INC_LIST ${RDCCLIENT_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib
|
||||
message("RDCCLIENT_LIB_INC_LIST=${RDCCLIENT_LIB_INC_LIST}")
|
||||
|
||||
add_library(${RDCCLIENT_LIB} SHARED ${RDCCLIENT_LIB_SRC_LIST} ${RDCCLIENT_LIB_INC_LIST})
|
||||
target_link_libraries(${RDCCLIENT_LIB} pthread rt grpc grpc++ grpc++_reflection
|
||||
target_link_libraries(${RDCCLIENT_LIB} ${BOOTSTRAP_LIB} pthread rt grpc grpc++ grpc++_reflection
|
||||
dl protobuf)
|
||||
target_include_directories(${RDCCLIENT_LIB} PRIVATE
|
||||
"${GRPC_ROOT}/include"
|
||||
|
||||
@@ -21,6 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcCacheManagerImpl.h"
|
||||
#include <sys/time.h>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <sstream>
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
@@ -213,6 +214,10 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index,
|
||||
if (value.field_id == RDC_FI_POWER_USAGE) {
|
||||
gpu_iter->second.energy_last_time = value.ts;
|
||||
}
|
||||
|
||||
// https://www.johndcook.com/blog/standard_deviation/
|
||||
fsummary->second.old_s = 0;
|
||||
fsummary->second.old_m = fsummary->second.new_m = value.value.l_int;
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
if (value.field_id == RDC_FI_POWER_USAGE) {
|
||||
@@ -229,6 +234,15 @@ rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index,
|
||||
fsummary->second.last_time = value.ts;
|
||||
fsummary->second.count++;
|
||||
|
||||
// https://www.johndcook.com/blog/standard_deviation/
|
||||
fsummary->second.new_m = fsummary->second.old_m +
|
||||
(value.value.l_int - fsummary->second.old_m)/fsummary->second.count;
|
||||
fsummary->second.new_s = fsummary->second.old_s +
|
||||
(value.value.l_int - fsummary->second.old_m)*
|
||||
(value.value.l_int - fsummary->second.new_m);
|
||||
fsummary->second.old_m = fsummary->second.new_m;
|
||||
fsummary->second.old_s = fsummary->second.new_s;
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
@@ -248,6 +262,11 @@ void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats,
|
||||
summary.min_value = std::min(summary.min_value, gpu.min_value);
|
||||
//< save total for future average calculation.
|
||||
summary.average += gpu.average;
|
||||
|
||||
//< calculate the sample variance
|
||||
gpu.standard_deviation = std::sqrt((stats.count > 1)
|
||||
? stats.new_s/(stats.count - 1) : 0.0)/adjuster;
|
||||
summary.standard_deviation += gpu.standard_deviation;
|
||||
}
|
||||
|
||||
rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
|
||||
@@ -274,15 +293,17 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
|
||||
summary_info.max_gpu_memory_used = 0;
|
||||
summary_info.ecc_correct = 0;
|
||||
summary_info.ecc_uncorrect = 0;
|
||||
summary_info.power_usage = {0, std::numeric_limits<uint64_t>::max(), 0};
|
||||
summary_info.pcie_tx = {0, std::numeric_limits<uint64_t>::max(), 0};
|
||||
summary_info.pcie_rx = {0, std::numeric_limits<uint64_t>::max(), 0};
|
||||
summary_info.gpu_temperature = {0, std::numeric_limits<uint64_t>::max(), 0};
|
||||
summary_info.memory_clock = {0, std::numeric_limits<uint64_t>::max(), 0};
|
||||
summary_info.gpu_clock = {0, std::numeric_limits<uint64_t>::max(), 0};
|
||||
summary_info.gpu_utilization = {0, std::numeric_limits<uint64_t>::max(), 0};
|
||||
summary_info.power_usage = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.pcie_tx = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.pcie_rx = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.gpu_temperature =
|
||||
{0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.memory_clock = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.gpu_clock = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.gpu_utilization =
|
||||
{0, std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
summary_info.memory_utilization = {0,
|
||||
std::numeric_limits<uint64_t>::max(), 0};
|
||||
std::numeric_limits<uint64_t>::max(), 0, 0};
|
||||
|
||||
p_job_info->num_gpus = job_stats->second.gpu_stats.size();
|
||||
|
||||
@@ -363,27 +384,25 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
|
||||
}
|
||||
}
|
||||
}
|
||||
// Get the average of the summary
|
||||
summary_info.power_usage.average = summary_info.power_usage.average/
|
||||
p_job_info->num_gpus;
|
||||
summary_info.gpu_clock.average = summary_info.gpu_clock.average/
|
||||
p_job_info->num_gpus;
|
||||
summary_info.gpu_utilization.average = summary_info.gpu_utilization.average/
|
||||
p_job_info->num_gpus;
|
||||
summary_info.memory_utilization.average =
|
||||
summary_info.memory_utilization.average/p_job_info->num_gpus;
|
||||
summary_info.pcie_tx.average = summary_info.pcie_tx.average/
|
||||
p_job_info->num_gpus;
|
||||
summary_info.pcie_rx.average = summary_info.pcie_rx.average/
|
||||
p_job_info->num_gpus;
|
||||
summary_info.gpu_temperature.average = summary_info.gpu_temperature.average/
|
||||
p_job_info->num_gpus;
|
||||
summary_info.memory_clock.average = summary_info.memory_clock.average/
|
||||
p_job_info->num_gpus;
|
||||
// Set the average of the summary
|
||||
set_average_summary(summary_info.power_usage, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.gpu_clock, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.gpu_utilization, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.memory_utilization, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.pcie_tx, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.pcie_rx, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.gpu_temperature, p_job_info->num_gpus);
|
||||
set_average_summary(summary_info.memory_clock, p_job_info->num_gpus);
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
/**
 * Turn the accumulated totals in a job-wide summary into per-GPU means.
 *
 * set_summary() adds each GPU's average and standard deviation into the
 * summary with +=, so on entry both fields hold sums over the GPUs. This
 * divides both by the number of GPUs.
 *
 * @param summary   Summary whose average and standard_deviation fields are
 *                  rewritten in place; max_value and min_value are untouched.
 * @param num_gpus  Number of GPUs that contributed to the totals. When it is
 *                  zero the summary is left unchanged.
 */
void RdcCacheManagerImpl::set_average_summary(
    rdc_stats_summary_t& summary, uint32_t num_gpus) {
  // average is an unsigned integer, so dividing by zero would be undefined
  // behaviour rather than yielding inf/NaN; bail out before dividing.
  if (num_gpus == 0) {
    return;
  }

  summary.average /= num_gpus;
  summary.standard_deviation /= num_gpus;
}
|
||||
|
||||
rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(const char job_id[64],
|
||||
const rdc_group_info_t& ginfo, const rdc_field_group_info_t& finfo,
|
||||
const rdc_gpu_gauges_t& gpu_gauges) {
|
||||
|
||||
@@ -42,6 +42,7 @@ RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
|
||||
|
||||
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(
|
||||
const char* group_name, rdc_gpu_group_t* p_rdc_group_id) {
|
||||
RDC_LOG(RDC_DEBUG, "Create group " << group_name);
|
||||
rdc_group_info_t ginfo;
|
||||
strncpy_with_null(ginfo.group_name, group_name, RDC_MAX_STR_LENGTH);
|
||||
ginfo.count = 0;
|
||||
@@ -135,6 +136,7 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create(
|
||||
uint32_t num_field_ids, uint32_t* field_ids,
|
||||
const char* field_group_name, rdc_field_grp_t* rdc_field_group_id) {
|
||||
|
||||
RDC_LOG(RDC_DEBUG, "Create field group " << field_group_name);
|
||||
rdc_field_group_info_t finfo;
|
||||
finfo.count = num_field_ids;
|
||||
strncpy_with_null(finfo.group_name, field_group_name, RDC_MAX_STR_LENGTH);
|
||||
|
||||
@@ -95,41 +95,49 @@ bool RdcStandaloneHandler::copy_gpu_usage_info(
|
||||
target->power_usage.max_value = pstats.max_value();
|
||||
target->power_usage.min_value = pstats.min_value();
|
||||
target->power_usage.average = pstats.average();
|
||||
target->power_usage.standard_deviation = pstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& cstats = src.gpu_clock();
|
||||
target->gpu_clock.max_value = cstats.max_value();
|
||||
target->gpu_clock.min_value = cstats.min_value();
|
||||
target->gpu_clock.average = cstats.average();
|
||||
target->gpu_clock.standard_deviation = cstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& ustats = src.gpu_utilization();
|
||||
target->gpu_utilization.max_value = ustats.max_value();
|
||||
target->gpu_utilization.min_value = ustats.min_value();
|
||||
target->gpu_utilization.average = ustats.average();
|
||||
target->gpu_utilization.standard_deviation = ustats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& mstats = src.memory_utilization();
|
||||
target->memory_utilization.max_value = mstats.max_value();
|
||||
target->memory_utilization.min_value = mstats.min_value();
|
||||
target->memory_utilization.average = mstats.average();
|
||||
target->memory_utilization.standard_deviation = mstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& txstats = src.pcie_tx();
|
||||
target->pcie_tx.max_value = txstats.max_value();
|
||||
target->pcie_tx.min_value = txstats.min_value();
|
||||
target->pcie_tx.average = txstats.average();
|
||||
target->pcie_tx.standard_deviation = txstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& rxstats = src.pcie_rx();
|
||||
target->pcie_rx.max_value = rxstats.max_value();
|
||||
target->pcie_rx.min_value = rxstats.min_value();
|
||||
target->pcie_rx.average = rxstats.average();
|
||||
target->pcie_rx.standard_deviation = rxstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& mcstats = src.memory_clock();
|
||||
target->memory_clock.max_value = mcstats.max_value();
|
||||
target->memory_clock.min_value = mcstats.min_value();
|
||||
target->memory_clock.average = mcstats.average();
|
||||
target->memory_clock.standard_deviation = mcstats.standard_deviation();
|
||||
|
||||
const ::rdc::JobStatsSummary& gtstats = src.gpu_temperature();
|
||||
target->gpu_temperature.max_value = gtstats.max_value();
|
||||
target->gpu_temperature.min_value = gtstats.min_value();
|
||||
target->gpu_temperature.average = gtstats.average();
|
||||
target->gpu_temperature.standard_deviation = gtstats.standard_deviation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ class RdciStatsSubSystem: public RdciSubSystem {
|
||||
private:
|
||||
void show_help() const;
|
||||
void show_job_stats(const rdc_gpu_usage_info_t& gpu_info) const;
|
||||
void show_job_stats_json(const rdc_gpu_usage_info_t& gpu_info) const;
|
||||
|
||||
enum OPERATIONS {
|
||||
STATS_UNKNOWN = 0,
|
||||
|
||||
@@ -39,7 +39,11 @@ class RdciSubSystem {
|
||||
|
||||
virtual void process() = 0;
|
||||
virtual ~RdciSubSystem();
|
||||
|
||||
bool is_json_output() const;
|
||||
|
||||
protected:
|
||||
void set_json_output(bool is_json);
|
||||
std::vector<std::string> split_string(const std::string& s,
|
||||
char delimiter) const;
|
||||
void show_common_usage() const;
|
||||
@@ -52,6 +56,9 @@ class RdciSubSystem {
|
||||
std::string root_ca_;
|
||||
std::string client_cert_;
|
||||
std::string client_key_;
|
||||
|
||||
private:
|
||||
bool is_json_output_;
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<RdciSubSystem> RdciSubSystemPtr;
|
||||
|
||||
@@ -35,11 +35,13 @@ RdciDiscoverySubSystem::RdciDiscoverySubSystem() : show_help_(false) {
|
||||
|
||||
void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
const int HOST_OPTIONS = 1000;
|
||||
const int JSON_OPTIONS = 1001;
|
||||
const struct option long_options[] = {
|
||||
{"host", required_argument, nullptr, HOST_OPTIONS },
|
||||
{"help", optional_argument, nullptr, 'h' },
|
||||
{"unauth", optional_argument, nullptr, 'u' },
|
||||
{"list", optional_argument, nullptr, 'l' },
|
||||
{"json", optional_argument, nullptr, JSON_OPTIONS },
|
||||
{ nullptr, 0 , nullptr, 0 }
|
||||
};
|
||||
|
||||
@@ -53,6 +55,9 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
case HOST_OPTIONS:
|
||||
ip_port_ = optarg;
|
||||
break;
|
||||
case JSON_OPTIONS:
|
||||
set_json_output(true);
|
||||
break;
|
||||
case 'h':
|
||||
show_help_ = true;
|
||||
return;
|
||||
@@ -77,12 +82,16 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
void RdciDiscoverySubSystem::show_help() const {
|
||||
if (is_json_output()) return;
|
||||
std::cout << " discovery -- Used to discover and identify GPUs "
|
||||
<< "and their attributes.\n\n";
|
||||
std::cout << "Usage\n";
|
||||
std::cout << " rdci discovery [--host <IP/FQDN>:port] [-u] -l\n";
|
||||
std::cout << " rdci discovery [--host <IP/FQDN>:port] [--json]"
|
||||
<< " [-u] -l\n";
|
||||
std::cout << "\nFlags:\n";
|
||||
show_common_usage();
|
||||
std::cout << " --json "
|
||||
<< "Output using json.\n";
|
||||
std::cout << " -l --list list GPU discovered"
|
||||
<<" on the system\n";
|
||||
}
|
||||
@@ -101,14 +110,22 @@ void RdciDiscoverySubSystem::process() {
|
||||
throw RdcException(result, "Fail to get device information");
|
||||
}
|
||||
if (count == 0) {
|
||||
std::cout << "No GPUs find on the sytem\n";
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"gpus\" : [], \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "No GPUs find on the system\n";
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
std::cout << count << " GPUs found.\n";
|
||||
std::cout << "------------------------------------------------"
|
||||
<< "-----------------\n";
|
||||
std::cout << "GPU Index\t Device Information\n";
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"gpus\" : [";
|
||||
} else {
|
||||
std::cout << count << " GPUs found.\n";
|
||||
std::cout << "------------------------------------------------"
|
||||
<< "-----------------\n";
|
||||
std::cout << "GPU Index\t Device Information\n";
|
||||
}
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
rdc_device_attributes_t attribute;
|
||||
result = rdc_device_get_attributes(rdc_handle_,
|
||||
@@ -116,10 +133,22 @@ void RdciDiscoverySubSystem::process() {
|
||||
if (result != RDC_ST_OK) {
|
||||
return;
|
||||
}
|
||||
std::cout << i << "\t\t" << attribute.device_name <<std::endl;
|
||||
if (is_json_output()) {
|
||||
std::cout << "{\"gpu_index\": \"" << i << "\", \"device_name\": \""
|
||||
<< attribute.device_name << "\"}";
|
||||
if (i != count -1) {
|
||||
std::cout << ",";
|
||||
}
|
||||
} else {
|
||||
std::cout << i << "\t\t" << attribute.device_name <<std::endl;
|
||||
}
|
||||
}
|
||||
std::cout << "------------------------------------------------"
|
||||
if (is_json_output()) {
|
||||
std::cout << ']';
|
||||
} else {
|
||||
std::cout << "------------------------------------------------"
|
||||
<< "-----------------\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ RdciFieldGroupSubSystem::RdciFieldGroupSubSystem():
|
||||
|
||||
void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
const int HOST_OPTIONS = 1000;
|
||||
const int JSON_OPTIONS = 1001;
|
||||
const struct option long_options[] = {
|
||||
{"host", required_argument, nullptr, HOST_OPTIONS },
|
||||
{"help", optional_argument, nullptr, 'h' },
|
||||
@@ -47,6 +48,7 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
{"fieldids", required_argument, nullptr, 'f'},
|
||||
{"info", optional_argument, nullptr, 'i' },
|
||||
{"delete", required_argument, nullptr, 'd' },
|
||||
{"json", optional_argument, nullptr, JSON_OPTIONS },
|
||||
{ nullptr, 0 , nullptr, 0 }
|
||||
};
|
||||
|
||||
@@ -59,6 +61,9 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
case HOST_OPTIONS:
|
||||
ip_port_ = optarg;
|
||||
break;
|
||||
case JSON_OPTIONS:
|
||||
set_json_output(true);
|
||||
break;
|
||||
case 'h':
|
||||
field_group_ops_ = FIELD_GROUP_HELP;
|
||||
return;
|
||||
@@ -112,18 +117,22 @@ void RdciFieldGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
void RdciFieldGroupSubSystem::show_help() const {
|
||||
if (is_json_output()) return;
|
||||
std::cout << " fieldgroup -- Used to create and maintain groups "
|
||||
<< "of field Ids.\n\n";
|
||||
std::cout << "Usage\n";
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port] [-u] -l\n";
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port] [-u] "
|
||||
<< "-c <groupName> -f <filedIds>\n";
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port] [-u] "
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port]"
|
||||
<< " [--json] [-u] -l\n";
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port] [--json]"
|
||||
<< " [-u] -c <groupName> -f <filedIds>\n";
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port] [--json] [-u] "
|
||||
<< "-g <groupId> -i\n";
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port] [-u] "
|
||||
std::cout << " rdci fieldgroup [--host <IP/FQDN>:port] [--json] [-u] "
|
||||
<< "-d <groupId>\n";
|
||||
std::cout << "\nFlags:\n";
|
||||
show_common_usage();
|
||||
std::cout << " --json "
|
||||
<< "Output using json.\n";
|
||||
std::cout << " -l --list "
|
||||
<< "List the field groups that currently exist for a host.\n";
|
||||
std::cout << " -g --group groupId "
|
||||
@@ -143,6 +152,7 @@ void RdciFieldGroupSubSystem::process() {
|
||||
rdc_status_t result = RDC_ST_OK;
|
||||
rdc_field_group_info_t group_info;
|
||||
uint32_t count = 0;
|
||||
std::string json_group_ids = "\"field_groups\": [";
|
||||
switch (field_group_ops_) {
|
||||
case FIELD_GROUP_HELP:
|
||||
show_help();
|
||||
@@ -170,9 +180,14 @@ void RdciFieldGroupSubSystem::process() {
|
||||
result = rdc_group_field_create(rdc_handle_, fields.size(),
|
||||
&field_ids[0], group_name_.c_str(), &group_id);
|
||||
if (result == RDC_ST_OK) {
|
||||
std::cout << "Successfully created a field group"
|
||||
<< " with a group ID " << group_id << std::endl;
|
||||
return;
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"field_group_id\": \"" << group_id
|
||||
<<"\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully created a field group"
|
||||
<< " with a group ID " << group_id << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -184,8 +199,13 @@ void RdciFieldGroupSubSystem::process() {
|
||||
}
|
||||
result = rdc_group_field_destroy(rdc_handle_, group_id_);
|
||||
if (result == RDC_ST_OK) {
|
||||
std::cout << "Successfully deleted the field group "
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"field_group_id\": \"" << group_id_
|
||||
<<"\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully deleted the field group "
|
||||
<< group_id_ << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@@ -195,8 +215,11 @@ void RdciFieldGroupSubSystem::process() {
|
||||
rdc_handle_, group_id_list, &count);
|
||||
if ( result != RDC_ST_OK) break;
|
||||
|
||||
std::cout << count << " field group found.\n";
|
||||
std::cout << "GroupID\t" << "GroupName\t" << "FieldIds\n";
|
||||
if (!is_json_output()) {
|
||||
std::cout << count << " field group found.\n";
|
||||
std::cout << "GroupID\t" << "GroupName\t" << "FieldIds\n";
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
result = rdc_group_field_get_info(
|
||||
rdc_handle_, group_id_list[i], &group_info);
|
||||
@@ -206,15 +229,44 @@ void RdciFieldGroupSubSystem::process() {
|
||||
std::to_string(group_id_list[i]));
|
||||
}
|
||||
|
||||
std::cout << group_id_list[i] << "\t"
|
||||
if (!is_json_output()) {
|
||||
std::cout << group_id_list[i] << "\t"
|
||||
<< group_info.group_name << "\t\t";
|
||||
} else {
|
||||
json_group_ids += "{\"group_id\": \"";
|
||||
json_group_ids += std::to_string(group_id_list[i]);
|
||||
json_group_ids += "\", \"group_name\": \"";
|
||||
json_group_ids += group_info.group_name;
|
||||
json_group_ids += "\", \"field_ids\": [";
|
||||
}
|
||||
|
||||
for (uint32_t j = 0; j < group_info.count; j++) {
|
||||
std::cout << group_info.field_ids[j];
|
||||
if (!is_json_output()) {
|
||||
std::cout << group_info.field_ids[j];
|
||||
} else {
|
||||
json_group_ids +=
|
||||
std::to_string(group_info.field_ids[j]);
|
||||
}
|
||||
if ( j < group_info.count -1 ) {
|
||||
std::cout << ",";
|
||||
if (!is_json_output()) {
|
||||
std::cout << ",";
|
||||
} else {
|
||||
json_group_ids += ",";
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
if (!is_json_output()) {
|
||||
std::cout << std::endl;
|
||||
} else {
|
||||
json_group_ids += "]}";
|
||||
if (i != count -1) {
|
||||
json_group_ids += ",";
|
||||
}
|
||||
}
|
||||
}
|
||||
if (is_json_output()) {
|
||||
json_group_ids += "], \"status\": \"ok\"";
|
||||
std::cout << json_group_ids;
|
||||
}
|
||||
break;
|
||||
case FIELD_GROUP_INFO:
|
||||
@@ -226,13 +278,29 @@ void RdciFieldGroupSubSystem::process() {
|
||||
result = rdc_group_field_get_info(
|
||||
rdc_handle_, group_id_, &group_info);
|
||||
if (result == RDC_ST_OK) {
|
||||
std::cout << "Group name: " << group_info.group_name
|
||||
<< std::endl;
|
||||
std::cout << "Field Ids: ";
|
||||
for (uint32_t i = 0; i < group_info.count; i++) {
|
||||
std::cout << group_info.field_ids[i] << " ";
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"group_name\": \"" << group_info.group_name
|
||||
<< "\", \"field_ids\": [";
|
||||
} else {
|
||||
std::cout << "Group name: " << group_info.group_name
|
||||
<< std::endl;
|
||||
std::cout << "Field Ids: ";
|
||||
}
|
||||
for (uint32_t i = 0; i < group_info.count; i++) {
|
||||
if (is_json_output()) {
|
||||
std::cout << group_info.field_ids[i];
|
||||
if ( i != group_info.count-1 ) {
|
||||
std::cout << ",";
|
||||
}
|
||||
} else {
|
||||
std::cout << group_info.field_ids[i] << " ";
|
||||
}
|
||||
}
|
||||
if (is_json_output()) {
|
||||
std::cout << "], \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -37,6 +37,7 @@ RdciGroupSubSystem::RdciGroupSubSystem():
|
||||
|
||||
void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
const int HOST_OPTIONS = 1000;
|
||||
const int JSON_OPTIONS = 1001;
|
||||
const struct option long_options[] = {
|
||||
{"host", required_argument, nullptr, HOST_OPTIONS },
|
||||
{"help", optional_argument, nullptr, 'h' },
|
||||
@@ -47,6 +48,7 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
{"add", required_argument, nullptr, 'a' },
|
||||
{"info", optional_argument, nullptr, 'i' },
|
||||
{"delete", required_argument, nullptr, 'd' },
|
||||
{"json", optional_argument, nullptr, JSON_OPTIONS },
|
||||
{ nullptr, 0 , nullptr, 0 }
|
||||
};
|
||||
|
||||
@@ -59,6 +61,9 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
case HOST_OPTIONS:
|
||||
ip_port_ = optarg;
|
||||
break;
|
||||
case JSON_OPTIONS:
|
||||
set_json_output(true);
|
||||
break;
|
||||
case 'h':
|
||||
group_ops_ = GROUP_HELP;
|
||||
return;
|
||||
@@ -116,18 +121,22 @@ void RdciGroupSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
void RdciGroupSubSystem::show_help() const {
|
||||
if (is_json_output()) return;
|
||||
std::cout << " group -- Used to create and maintain groups of GPUs.\n\n";
|
||||
std::cout << "Usage\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -l\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -c <groupName> "
|
||||
<< "[-a <entityId>]\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -g <groupId> "
|
||||
<< "[-a <entityId>]\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] "
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [--json] [-u] -l\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [--json] [-u]"
|
||||
<< " -c <groupName> [-a <entityId>]\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [--json] [-u]"
|
||||
<< " -g <groupId> [-a <entityId>]\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [--json] [-u] "
|
||||
<< "-g <groupId> [-i]\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [-u] -d <groupId>\n";
|
||||
std::cout << " rdci group [--host <IP/FQDN>:port] [--json] [-u] "
|
||||
<< "-d <groupId>\n";
|
||||
std::cout << "\nFlags:\n";
|
||||
show_common_usage();
|
||||
std::cout << " --json "
|
||||
<< "Output using json.\n";
|
||||
std::cout << " -l --list "
|
||||
<< "List the groups that currently exist for a host.\n";
|
||||
std::cout << " -g --group groupId "
|
||||
@@ -148,6 +157,7 @@ void RdciGroupSubSystem::process() {
|
||||
std::vector<std::string> gpu_ids;
|
||||
rdc_group_info_t group_info;
|
||||
uint32_t count = 0;
|
||||
std::string json_group_ids = "\"gpu_groups\": [";
|
||||
switch (group_ops_) {
|
||||
case GROUP_HELP:
|
||||
show_help();
|
||||
@@ -181,8 +191,13 @@ void RdciGroupSubSystem::process() {
|
||||
}
|
||||
|
||||
if (result == RDC_ST_OK) {
|
||||
std::cout << "Successfully created group with a group ID "
|
||||
<< group_id << std::endl;
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"group_id\": \"" << group_id
|
||||
<<"\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully created group with a group ID "
|
||||
<< group_id << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@@ -194,8 +209,13 @@ void RdciGroupSubSystem::process() {
|
||||
}
|
||||
result = rdc_group_gpu_destroy(rdc_handle_, group_id_);
|
||||
if (result == RDC_ST_OK) {
|
||||
std::cout << "Successfully deleted the group "
|
||||
<< group_id_ << std::endl;
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"group_id\": \"" << group_id_
|
||||
<<"\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully deleted the group "
|
||||
<< group_id_ << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@@ -204,8 +224,10 @@ void RdciGroupSubSystem::process() {
|
||||
result = rdc_group_get_all_ids(rdc_handle_, group_id_list, &count);
|
||||
if ( result != RDC_ST_OK) break;
|
||||
|
||||
std::cout << count << " group found.\n";
|
||||
std::cout << "GroupID\t" << "GroupName\t" << "GPUIndex\n";
|
||||
if (!is_json_output()) {
|
||||
std::cout << count << " group found.\n";
|
||||
std::cout << "GroupID\t" << "GroupName\t" << "GPUIndex\n";
|
||||
}
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
result = rdc_group_gpu_get_info(rdc_handle_,
|
||||
group_id_list[i], &group_info);
|
||||
@@ -215,15 +237,43 @@ void RdciGroupSubSystem::process() {
|
||||
+ std::to_string(group_id_list[i]));
|
||||
}
|
||||
|
||||
std::cout << group_id_list[i] << "\t"
|
||||
<< group_info.group_name << "\t\t";
|
||||
if (!is_json_output()) {
|
||||
std::cout << group_id_list[i] << "\t"
|
||||
<< group_info.group_name << "\t\t";
|
||||
} else {
|
||||
json_group_ids += "{\"group_id\": \"";
|
||||
json_group_ids += std::to_string(group_id_list[i]);
|
||||
json_group_ids += "\", \"group_name\": \"";
|
||||
json_group_ids += group_info.group_name;
|
||||
json_group_ids += "\", \"gpu_indexes\": [";
|
||||
}
|
||||
for (uint32_t j = 0; j < group_info.count; j++) {
|
||||
std::cout << group_info.entity_ids[j];
|
||||
if (!is_json_output()) {
|
||||
std::cout << group_info.entity_ids[j];
|
||||
} else {
|
||||
json_group_ids +=
|
||||
std::to_string(group_info.entity_ids[j]);
|
||||
}
|
||||
if (j < group_info.count -1) {
|
||||
std::cout << ",";
|
||||
if (!is_json_output()) {
|
||||
std::cout << ",";
|
||||
} else {
|
||||
json_group_ids += ",";
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << std::endl;
|
||||
if (!is_json_output()) {
|
||||
std::cout << std::endl;
|
||||
} else {
|
||||
json_group_ids += "]}";
|
||||
if (i != count -1) {
|
||||
json_group_ids += ",";
|
||||
}
|
||||
}
|
||||
}
|
||||
if (is_json_output()) {
|
||||
json_group_ids += "], \"status\": \"ok\"";
|
||||
std::cout << json_group_ids;
|
||||
}
|
||||
break;
|
||||
case GROUP_ADD_GPUS:
|
||||
@@ -247,8 +297,13 @@ void RdciGroupSubSystem::process() {
|
||||
}
|
||||
}
|
||||
if (result == RDC_ST_OK) {
|
||||
std::cout << "Successfully added the GPU " << gpu_ids_
|
||||
<< " to group "<< group_id_ << std::endl;
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"group_id\": \"" << group_id_
|
||||
<<"\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully added the GPU " << gpu_ids_
|
||||
<< " to group "<< group_id_ << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@@ -261,13 +316,29 @@ void RdciGroupSubSystem::process() {
|
||||
result = rdc_group_gpu_get_info(rdc_handle_,
|
||||
group_id_, &group_info);
|
||||
if (result == RDC_ST_OK) {
|
||||
std::cout << "Group name: "
|
||||
<< group_info.group_name << std::endl;
|
||||
std::cout << "Gpu indexes: ";
|
||||
for (uint32_t i = 0; i < group_info.count; i++) {
|
||||
std::cout << group_info.entity_ids[i] << " ";
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"group_name\": \"" << group_info.group_name
|
||||
<< "\", \"gpu_indexes\": [";
|
||||
} else {
|
||||
std::cout << "Group name: "
|
||||
<< group_info.group_name << std::endl;
|
||||
std::cout << "Gpu indexes: ";
|
||||
}
|
||||
for (uint32_t i = 0; i < group_info.count; i++) {
|
||||
if (is_json_output()) {
|
||||
std::cout << group_info.entity_ids[i];
|
||||
if ( i != group_info.count-1 ) {
|
||||
std::cout << ",";
|
||||
}
|
||||
} else {
|
||||
std::cout << group_info.entity_ids[i] << " ";
|
||||
}
|
||||
}
|
||||
if (is_json_output()) {
|
||||
std::cout << "], \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -23,6 +23,7 @@ THE SOFTWARE.
|
||||
#include <getopt.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <ctime>
|
||||
#include <limits>
|
||||
#include <iomanip>
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
@@ -42,6 +43,7 @@ RdciStatsSubSystem::~RdciStatsSubSystem() {
|
||||
|
||||
void RdciStatsSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
const int HOST_OPTIONS = 1000;
|
||||
const int JSON_OPTIONS = 1001;
|
||||
const struct option long_options[] = {
|
||||
{"host", required_argument, nullptr, HOST_OPTIONS },
|
||||
{"help", optional_argument, nullptr, 'h' },
|
||||
@@ -53,6 +55,7 @@ void RdciStatsSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
{"jremoveall", optional_argument, nullptr, 'a' },
|
||||
{"verbose", optional_argument, nullptr, 'v'},
|
||||
{"group", required_argument, nullptr, 'g'},
|
||||
{"json", optional_argument, nullptr, JSON_OPTIONS},
|
||||
{ nullptr, 0 , nullptr, 0 }
|
||||
};
|
||||
|
||||
@@ -66,6 +69,9 @@ void RdciStatsSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
case HOST_OPTIONS:
|
||||
ip_port_ = optarg;
|
||||
break;
|
||||
case JSON_OPTIONS:
|
||||
set_json_output(true);
|
||||
break;
|
||||
case 'h':
|
||||
stats_ops_ = STATS_HELP;
|
||||
return;
|
||||
@@ -119,17 +125,22 @@ void RdciStatsSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
void RdciStatsSubSystem::show_help() const {
|
||||
if (is_json_output()) return;
|
||||
std::cout << " stats -- Used to view job statistics.\n\n";
|
||||
std::cout << "Usage\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -s <jobId>"
|
||||
<< " -g <groupId>\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -x <jobId>\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] [-v] "
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] [--json] "
|
||||
<< "-s <jobId> -g <groupId>\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] [--json] "
|
||||
<< "-x <jobId>\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] [--json] [-v] "
|
||||
<< "-j <jobId>\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -r <jobId>\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] -a\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] [--json] "
|
||||
<< "-r <jobId>\n";
|
||||
std::cout << " rdci stats [--host <IP/FQDN>:port] [-u] [--json] -a\n";
|
||||
std::cout << "\nFlags:\n";
|
||||
show_common_usage();
|
||||
std::cout << " --json "
|
||||
<< "Output using json.\n";
|
||||
std::cout << " -s --jstart Start recording "
|
||||
<< "job statistics.\n";
|
||||
std::cout << " -g --group-id The GPU group to query "
|
||||
@@ -146,14 +157,103 @@ void RdciStatsSubSystem::show_help() const {
|
||||
<< "all job statistics.\n";
|
||||
}
|
||||
|
||||
void RdciStatsSubSystem::show_job_stats_json(
|
||||
const rdc_gpu_usage_info_t& gpu_info) const {
|
||||
std::cout << "\"start_time\": " << gpu_info.start_time << ",";
|
||||
std::cout << "\"end_time\": " << gpu_info.end_time << ",";
|
||||
std::cout << "\"execution_time\": " <<
|
||||
(gpu_info.end_time-gpu_info.start_time) << ",";
|
||||
std::cout << "\"energy_consumed\": " << gpu_info.energy_consumed << ",";
|
||||
|
||||
std::cout << "\"power_usage_max\": "
|
||||
<< gpu_info.power_usage.max_value << ",";
|
||||
std::cout << "\"power_usage_min\": "
|
||||
<< gpu_info.power_usage.min_value << ",";
|
||||
std::cout << "\"power_usage_avg\": "
|
||||
<< gpu_info.power_usage.average << ",";
|
||||
std::cout << "\"power_usage_stanard_deviation\": "
|
||||
<< gpu_info.power_usage.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"gpu_clock_max\": "
|
||||
<< gpu_info.gpu_clock.max_value << ",";
|
||||
std::cout << "\"gpu_clock_min\": "
|
||||
<< gpu_info.gpu_clock.min_value << ",";
|
||||
std::cout << "\"gpu_clock_avg\": "
|
||||
<< gpu_info.gpu_clock.average << ",";
|
||||
std::cout << "\"gpu_clock_stanard_deviation\": "
|
||||
<< gpu_info.gpu_clock.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"memory_clock_max\": "
|
||||
<< gpu_info.memory_clock.max_value << ",";
|
||||
std::cout << "\"memory_clock_min\": "
|
||||
<< gpu_info.memory_clock.min_value << ",";
|
||||
std::cout << "\"memory_clock_avg\": "
|
||||
<< gpu_info.memory_clock.average << ",";
|
||||
std::cout << "\"memory_clock_stanard_deviation\": "
|
||||
<< gpu_info.memory_clock.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"gpu_utilization_max\": "
|
||||
<< gpu_info.gpu_utilization.max_value << ",";
|
||||
std::cout << "\"gpu_utilization_min\": "
|
||||
<< gpu_info.gpu_utilization.min_value << ",";
|
||||
std::cout << "\"gpu_utilization_avg\": "
|
||||
<< gpu_info.gpu_utilization.average << ",";
|
||||
std::cout << "\"gpu_utilization_deviation\": "
|
||||
<< gpu_info.gpu_utilization.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"max_gpu_memory_used\": "
|
||||
<< gpu_info.max_gpu_memory_used << ",";
|
||||
|
||||
std::cout << "\"memory_utilization_max\": "
|
||||
<< gpu_info.memory_utilization.max_value << ",";
|
||||
std::cout << "\"memory_utilization_min\": "
|
||||
<< gpu_info.memory_utilization.min_value << ",";
|
||||
std::cout << "\"memory_utilization_avg\": "
|
||||
<< gpu_info.memory_utilization.average << ",";
|
||||
std::cout << "\"memory_utilization_stanard_deviation\": "
|
||||
<< gpu_info.memory_utilization.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"gpu_temperature_max\": "
|
||||
<< gpu_info.gpu_temperature.max_value << ",";
|
||||
std::cout << "\"gpu_temperature_min\": "
|
||||
<< gpu_info.gpu_temperature.min_value << ",";
|
||||
std::cout << "\"gpu_temperature_avg\": "
|
||||
<< gpu_info.gpu_temperature.average << ",";
|
||||
std::cout << "\"gpu_temperature_stanard_deviation\": "
|
||||
<< gpu_info.gpu_temperature.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"pcie_rx_max\": "
|
||||
<< gpu_info.pcie_rx.max_value << ",";
|
||||
std::cout << "\"pcie_rx_min\": "
|
||||
<< gpu_info.pcie_rx.min_value << ",";
|
||||
std::cout << "\"pcie_rx_avg\": "
|
||||
<< gpu_info.pcie_rx.average << ",";
|
||||
std::cout << "\"pcie_rx_stanard_deviation\": "
|
||||
<< gpu_info.pcie_rx.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"pcie_tx_max\": "
|
||||
<< gpu_info.pcie_tx.max_value << ",";
|
||||
std::cout << "\"pcie_tx_min\": "
|
||||
<< gpu_info.pcie_tx.min_value << ",";
|
||||
std::cout << "\"pcie_tx_avg\": "
|
||||
<< gpu_info.pcie_tx.average << ",";
|
||||
std::cout << "\"pcie_tx_stanard_deviation\": "
|
||||
<< gpu_info.pcie_tx.standard_deviation << ",";
|
||||
|
||||
std::cout << "\"ecc_correct\": " << gpu_info.ecc_correct << ",";
|
||||
std::cout << "\"ecc_uncorrect\": " << gpu_info.ecc_uncorrect;
|
||||
}
|
||||
|
||||
void RdciStatsSubSystem::show_job_stats(
|
||||
const rdc_gpu_usage_info_t& gpu_info) const {
|
||||
std::cout << "|------- Execution Stats ----------"
|
||||
<< "+------------------------------------\n";
|
||||
std::cout << "| Start Time | "
|
||||
<< gpu_info.start_time << "\n";
|
||||
<< std::put_time(std::gmtime(reinterpret_cast<const time_t*>
|
||||
(&gpu_info.start_time)), "%c %Z") << "\n";
|
||||
std::cout << "| End Time | "
|
||||
<< gpu_info.end_time << "\n";
|
||||
<< std::put_time(std::gmtime(reinterpret_cast<const time_t*>
|
||||
(&gpu_info.end_time)), "%c %Z") << "\n";
|
||||
std::cout << "| Total Execution Time (sec) | "
|
||||
<< (gpu_info.end_time-gpu_info.start_time) << "\n";
|
||||
std::cout << "+------- Performance Stats --------"
|
||||
@@ -163,37 +263,53 @@ void RdciStatsSubSystem::show_job_stats(
|
||||
std::cout << "| Power Usage (Watts) | " << "Max: "
|
||||
<< gpu_info.power_usage.max_value<< " Min: "<<
|
||||
gpu_info.power_usage.min_value << " Avg: "
|
||||
<< gpu_info.power_usage.average << "\n";
|
||||
<< gpu_info.power_usage.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.power_usage.standard_deviation << "\n";
|
||||
std::cout << "| GPU Clock (MHz) | " << "Max: "
|
||||
<< gpu_info.gpu_clock.max_value << " Min: " <<
|
||||
gpu_info.gpu_clock.min_value << " Avg: "
|
||||
<< gpu_info.gpu_clock.average << "\n";
|
||||
<< gpu_info.gpu_clock.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.gpu_clock.standard_deviation << "\n";
|
||||
std::cout << "| Memory Clock (MHz) | " << "Max: "
|
||||
<< gpu_info.memory_clock.max_value << " Min: " <<
|
||||
gpu_info.memory_clock.min_value << " Avg: "
|
||||
<< gpu_info.memory_clock.average << "\n";
|
||||
<< gpu_info.memory_clock.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.memory_clock.standard_deviation << "\n";
|
||||
std::cout << "| GPU Utilization (%) | " << "Max: "
|
||||
<< gpu_info.gpu_utilization.max_value <<" Min: " <<
|
||||
gpu_info.gpu_utilization.min_value << " Avg: " <<
|
||||
gpu_info.gpu_utilization.average << "\n";
|
||||
gpu_info.gpu_utilization.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.gpu_utilization.standard_deviation << "\n";
|
||||
std::cout << "| Max GPU Memory Used (bytes) | " <<
|
||||
gpu_info.max_gpu_memory_used << "\n";
|
||||
std::cout << "| Memory Utilization (%) | "
|
||||
<< "Max: " << gpu_info.memory_utilization.max_value
|
||||
<<" Min: "<< gpu_info.memory_utilization.min_value
|
||||
<< " Avg: " << gpu_info.memory_utilization.average << "\n";
|
||||
<< " Avg: " << gpu_info.memory_utilization.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.memory_utilization.standard_deviation << "\n";
|
||||
std::cout << "| GPU Temperature (Celsius) | "
|
||||
<< "Max: " << gpu_info.gpu_temperature.max_value
|
||||
<<" Min: "<< gpu_info.gpu_temperature.min_value
|
||||
<< " Avg: " << gpu_info.gpu_temperature.average << "\n";
|
||||
<< " Avg: " << gpu_info.gpu_temperature.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.gpu_temperature.standard_deviation << "\n";
|
||||
std::cout << "| PCIe Rx Bandwidth (megabytes) | "
|
||||
<< "Max: " << gpu_info.pcie_rx.max_value
|
||||
<<" Min: "<< gpu_info.pcie_rx.min_value
|
||||
<< " Avg: " << gpu_info.pcie_rx.average << "\n";
|
||||
<< " Avg: " << gpu_info.pcie_rx.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.pcie_rx.standard_deviation << "\n";
|
||||
std::cout << "| PCIe Tx Bandwidth (megabytes) | "
|
||||
<< "Max: " << gpu_info.pcie_tx.max_value
|
||||
<<" Min: "<< gpu_info.pcie_tx.min_value
|
||||
<< " Avg: " << gpu_info.pcie_tx.average << "\n";
|
||||
<< " Avg: " << gpu_info.pcie_tx.average << " SD: "
|
||||
<< std::fixed << std::setprecision(2)
|
||||
<< gpu_info.pcie_tx.standard_deviation << "\n";
|
||||
std::cout << "| Correctable ECC Errors | "
|
||||
<< gpu_info.ecc_correct << "\n";
|
||||
std::cout << "| Uncorrectable ECC Errors | "
|
||||
@@ -217,8 +333,13 @@ void RdciStatsSubSystem::process() {
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result, rdc_status_string(result));
|
||||
}
|
||||
std::cout << "Successfully started recording job "
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"job_id\": \"" << job_id_ << "\", \"group_id\": \""
|
||||
<< group_id_ <<"\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully started recording job "
|
||||
<< job_id_ << " with a group ID " << group_id_ << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -228,8 +349,13 @@ void RdciStatsSubSystem::process() {
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result, rdc_status_string(result));
|
||||
}
|
||||
std::cout << "Successfully stopped recording job "
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"job_id\": \"" << job_id_
|
||||
<< "\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully stopped recording job "
|
||||
<< job_id_ << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -241,14 +367,26 @@ void RdciStatsSubSystem::process() {
|
||||
throw RdcException(result, rdc_status_string(result));
|
||||
}
|
||||
|
||||
std::cout << "| Summary \n";
|
||||
show_job_stats(job_info.summary);
|
||||
if (!is_json_output()) {
|
||||
std::cout << "| Summary \n";
|
||||
show_job_stats(job_info.summary);
|
||||
} else {
|
||||
std::cout << "\"job_summary\" : {";
|
||||
show_job_stats_json(job_info.summary);
|
||||
std::cout << "}";
|
||||
}
|
||||
if (is_verbose_ == false) {
|
||||
return;
|
||||
}
|
||||
for (uint32_t i = 0; i < job_info.num_gpus; i++) {
|
||||
std::cout << "| GPU " << i << "\n";
|
||||
show_job_stats(job_info.gpus[i]);
|
||||
if (!is_json_output()) {
|
||||
std::cout << "| GPU " << i << "\n";
|
||||
show_job_stats(job_info.gpus[i]);
|
||||
} else {
|
||||
std:: cout << ", \"gpu_" << i << "\": {";
|
||||
show_job_stats_json(job_info.gpus[i]);
|
||||
std::cout << "}";
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -259,8 +397,13 @@ void RdciStatsSubSystem::process() {
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result, rdc_status_string(result));
|
||||
}
|
||||
std::cout << "Successfully removed job "
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"job_id\": \"" << job_id_
|
||||
<< "\", \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully removed job "
|
||||
<< job_id_ << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -269,7 +412,11 @@ void RdciStatsSubSystem::process() {
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result, rdc_status_string(result));
|
||||
}
|
||||
std::cout << "Successfully removed all jobs\n";
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "Successfully removed all jobs\n";
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,13 +33,18 @@ RdciSubSystem::RdciSubSystem():
|
||||
, use_auth_(true)
|
||||
, root_ca_("/etc/rdc/client/certs/rdc_cacert.pem")
|
||||
, client_cert_("/etc/rdc/client/certs/rdc_client_cert.pem")
|
||||
, client_key_("/etc/rdc/client/private/rdc_client_cert.key") {
|
||||
, client_key_("/etc/rdc/client/private/rdc_client_cert.key")
|
||||
, is_json_output_(false) {
|
||||
rdc_status_t status = rdc_init(0);
|
||||
if (status != RDC_ST_OK) {
|
||||
throw RdcException(status, "RDC initialize fail");
|
||||
}
|
||||
}
|
||||
|
||||
bool RdciSubSystem::is_json_output() const {
|
||||
return is_json_output_;
|
||||
}
|
||||
|
||||
bool RdciSubSystem::get_field_id_from_name(
|
||||
const std::string& name, uint32_t& value) const {
|
||||
const std::map<std::string, uint32_t> field_name_to_id = {
|
||||
@@ -154,6 +159,11 @@ void RdciSubSystem::show_common_usage() const {
|
||||
<< "information and exits.\n";
|
||||
}
|
||||
|
||||
// Sets the JSON-output flag and, when JSON mode is being switched on,
// writes the opening brace of the top-level JSON object to std::cout.
//
// The brace is written only on the off -> on transition: calling this with
// false, or calling it with true a second time, must not emit a stray "{".
// Disabling JSON mode after it was enabled does not emit a closing brace
// here.
void RdciSubSystem::set_json_output(bool is_json) {
    if (is_json && !is_json_output_) {
        std::cout << "{";
    }
    is_json_output_ = is_json;
}
|
||||
|
||||
RdciSubSystem::~RdciSubSystem() {
|
||||
if (rdc_handle_) {
|
||||
rdc_disconnect(rdc_handle_);
|
||||
@@ -161,6 +171,10 @@ RdciSubSystem::~RdciSubSystem() {
|
||||
}
|
||||
|
||||
rdc_shutdown();
|
||||
|
||||
if (is_json_output_) {
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rdc
|
||||
|
||||
@@ -42,9 +42,9 @@ int main(int argc, char ** argv) {
|
||||
exit(0);
|
||||
}
|
||||
|
||||
amd::rdc::RdciSubSystemPtr subsystem;
|
||||
try {
|
||||
std::string subsystem_name = argv[1];
|
||||
amd::rdc::RdciSubSystemPtr subsystem;
|
||||
if (subsystem_name == "discovery") {
|
||||
subsystem.reset(new amd::rdc::RdciDiscoverySubSystem());
|
||||
} else if (subsystem_name == "dmon") {
|
||||
@@ -66,11 +66,20 @@ int main(int argc, char ** argv) {
|
||||
|
||||
subsystem->process();
|
||||
} catch (const amd::rdc::RdcException& e) {
|
||||
std::cout << "rdci Error: " << e.what() << std::endl;
|
||||
if (subsystem && subsystem->is_json_output()) {
|
||||
std::cout << "\"status\": \"error\", \"description\": \""
|
||||
<< e.what() << '"';
|
||||
} else {
|
||||
std::cout << "rdci Error: " << e.what() << std::endl;
|
||||
}
|
||||
return e.error_code();
|
||||
} catch (...) {
|
||||
std::cout << "Unhandled exception." << std::endl;
|
||||
return 1;
|
||||
if (subsystem && subsystem->is_json_output()) {
|
||||
std::cout << "\"status\": \"error\", \"description\": "
|
||||
<< "\"Unhandled exception.\"";
|
||||
} else {
|
||||
std::cout << "Unhandled exception." << std::endl;
|
||||
} return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -30,6 +30,8 @@ THE SOFTWARE.
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_api_service.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -488,41 +490,49 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
|
||||
stats->set_max_value(src.power_usage.max_value);
|
||||
stats->set_min_value(src.power_usage.min_value);
|
||||
stats->set_average(src.power_usage.average);
|
||||
stats->set_standard_deviation(src.power_usage.standard_deviation);
|
||||
|
||||
stats = target->mutable_gpu_clock();
|
||||
stats->set_max_value(src.gpu_clock.max_value);
|
||||
stats->set_min_value(src.gpu_clock.min_value);
|
||||
stats->set_average(src.gpu_clock.average);
|
||||
stats->set_standard_deviation(src.gpu_clock.standard_deviation);
|
||||
|
||||
stats = target->mutable_gpu_utilization();
|
||||
stats->set_max_value(src.gpu_utilization.max_value);
|
||||
stats->set_min_value(src.gpu_utilization.min_value);
|
||||
stats->set_average(src.gpu_utilization.average);
|
||||
stats->set_standard_deviation(src.gpu_utilization.standard_deviation);
|
||||
|
||||
stats = target->mutable_memory_utilization();
|
||||
stats->set_max_value(src.memory_utilization.max_value);
|
||||
stats->set_min_value(src.memory_utilization.min_value);
|
||||
stats->set_average(src.memory_utilization.average);
|
||||
stats->set_standard_deviation(src.memory_utilization.standard_deviation);
|
||||
|
||||
stats = target->mutable_pcie_tx();
|
||||
stats->set_max_value(src.pcie_tx.max_value);
|
||||
stats->set_min_value(src.pcie_tx.min_value);
|
||||
stats->set_average(src.pcie_tx.average);
|
||||
stats->set_standard_deviation(src.pcie_tx.standard_deviation);
|
||||
|
||||
stats = target->mutable_pcie_rx();
|
||||
stats->set_max_value(src.pcie_rx.max_value);
|
||||
stats->set_min_value(src.pcie_rx.min_value);
|
||||
stats->set_average(src.pcie_rx.average);
|
||||
stats->set_standard_deviation(src.pcie_rx.standard_deviation);
|
||||
|
||||
stats = target->mutable_memory_clock();
|
||||
stats->set_max_value(src.memory_clock.max_value);
|
||||
stats->set_min_value(src.memory_clock.min_value);
|
||||
stats->set_average(src.memory_clock.average);
|
||||
stats->set_standard_deviation(src.memory_clock.standard_deviation);
|
||||
|
||||
stats = target->mutable_gpu_temperature();
|
||||
stats->set_max_value(src.gpu_temperature.max_value);
|
||||
stats->set_min_value(src.gpu_temperature.min_value);
|
||||
stats->set_average(src.gpu_temperature.average);
|
||||
stats->set_standard_deviation(src.gpu_temperature.standard_deviation);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -312,6 +312,12 @@ RDCServer::ShutDown(void) {
|
||||
delete rdc_admin_service_;
|
||||
rdc_admin_service_ = nullptr;
|
||||
}
|
||||
|
||||
if (api_service_) {
|
||||
delete api_service_;
|
||||
api_service_ = nullptr;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void * ProcessSignalLoop(void *server_ptr) {
|
||||
|
||||
Referens i nytt ärende
Block a user