Add support to use the field name in rdci

In rdci dmon and fieldgroup, fields can now be specified using either
the numeric ID or the field name.

When rdc is fetching a metric asynchronously, that pending fetch is no
longer reported as an error.

Change-Id: I81331e2c239af987181147be5ac0e29ba1617ab4


[ROCm/rdc commit: d30cb81fdb]
This commit is contained in:
Bill(Shuzhou) Liu
2020-05-19 09:38:50 -04:00
committed by Chris Freehill
parent aa0a40f84d
commit 0f8f345992
6 changed files with 60 additions and 12 deletions
@@ -60,7 +60,9 @@ class RdcMetricFetcherImpl: public RdcMetricFetcher {
uint64_t now();
void get_ecc_error(uint32_t gpu_index,
uint32_t field_id, rdc_field_value* value);
void async_get_pcie_throughput(uint32_t gpu_index,
//!< return true if starting async_get
bool async_get_pcie_throughput(uint32_t gpu_index,
uint32_t field_id, rdc_field_value* value);
void get_pcie_throughput(const RdcFieldKey& key);
@@ -120,10 +120,10 @@ void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index,
}
}
void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
uint32_t field_id, rdc_field_value* value) {
if (!value) {
return;
return false;
}
do {
@@ -136,7 +136,7 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
value->status = metric->second.value.status;
value->type = metric->second.value.type;
value->value = metric->second.value.value;
return;
return false;
}
}
@@ -150,6 +150,8 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
field_id_string(field_id) << " to cache.");
} while (0);
cv_.notify_all();
return true;
}
void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
@@ -221,6 +223,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
uint64_t i64 = 0;
rsmi_temperature_type_t sensor_type;
rsmi_clk_type_t clk_type;
bool async_fetching = false;
if (!is_field_valid(field_id)) {
RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id
@@ -313,7 +316,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
break;
case RDC_FI_PCIE_TX:
case RDC_FI_PCIE_RX:
async_get_pcie_throughput(gpu_index, field_id, value);
async_fetching = async_get_pcie_throughput(
gpu_index, field_id, value);
break;
default:
break;
@@ -321,9 +325,13 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
int64_t latency = now()-value->ts;
if (value->status != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
if (async_fetching) { //!< Async fetching is not an error
RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id));
} else {
RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
field_id_string(field_id) << " with rsmi error code "
<< value->status <<", latency " << latency);
}
} else if (value->type == INTEGER) {
RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
field_id_string(field_id) << ":" << value->value.l_int
@@ -43,6 +43,8 @@ class RdciSubSystem {
std::vector<std::string> split_string(const std::string& s,
char delimiter) const;
void show_common_usage() const;
bool get_field_id_from_name(const std::string& name,
uint32_t& value) const; // NOLINT(runtime/references)
rdc_handle_t rdc_handle_;
std::string ip_port_;
+8 -3
View File
@@ -147,10 +147,15 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) {
std::vector<std::string> vec_ids = split_string(field_ids, ',');
for (uint32_t i = 0; i < vec_ids.size(); i++) {
if (!IsNumber(vec_ids[i])) {
throw RdcException(RDC_ST_BAD_PARAMETER, "The field Id "
+vec_ids[i]+" needs to be a number");
uint32_t field_id = 0;
if (!get_field_id_from_name(vec_ids[i], field_id)) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"The field name "+vec_ids[i]+" is not valid");
}
field_ids_.push_back(field_id);
} else {
field_ids_.push_back(std::stoi(vec_ids[i]));
}
field_ids_.push_back(std::stoi(vec_ids[i]));
}
}
}
@@ -158,10 +158,13 @@ void RdciFieldGroupSubSystem::process() {
uint32_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP];
for (uint32_t i = 0; i < fields.size(); i++) {
if (!IsNumber(fields[i])) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"The field Id "+fields[i]+" needs to be a number");
if (!get_field_id_from_name(fields[i], field_ids[i])) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"The field name "+fields[i]+" is not valid");
}
} else {
field_ids[i] = std::stoi(fields[i]);
}
field_ids[i] = std::stoi(fields[i]);
}
rdc_field_grp_t group_id;
result = rdc_group_field_create(rdc_handle_, fields.size(),
+28
View File
@@ -40,6 +40,34 @@ RdciSubSystem::RdciSubSystem():
}
}
/**
 * Translate a symbolic field name into its numeric field id.
 *
 * The lookup is an exact, case-sensitive match against the names listed in
 * the table below.
 *
 * @param name   Field name to look up, e.g. "RDC_FI_GPU_TEMP".
 * @param value  Receives the numeric id when the name is found; left
 *               untouched when it is not.
 * @return true if the name is known and value was set, false otherwise.
 */
bool RdciSubSystem::get_field_id_from_name(
        const std::string& name, uint32_t& value) const {
    // Function-local static: the table is built once on first use instead of
    // being reconstructed on every call (callers invoke this once per field
    // while looping over the user-supplied list).
    static const std::map<std::string, uint32_t> field_name_to_id = {
        {"RDC_FI_GPU_MEMORY_USAGE", 525},
        {"RDC_FI_GPU_MEMORY_TOTAL", 580},
        {"RDC_FI_POWER_USAGE", 155},
        {"RDC_FI_GPU_CLOCK", 100},
        {"RDC_FI_MEM_CLOCK", 101},
        {"RDC_FI_PCIE_TX", 200},
        {"RDC_FI_PCIE_RX", 201},
        {"RDC_FI_GPU_UTIL", 203},
        {"RDC_FI_ECC_CORRECT_TOTAL", 312},
        {"RDC_FI_ECC_UNCORRECT_TOTAL", 313},
        {"RDC_FI_MEMORY_TEMP", 140},
        {"RDC_FI_GPU_TEMP", 150},
        {"RDC_FI_GPU_COUNT", 4},
        {"RDC_FI_DEV_NAME", 50}
    };

    const auto id = field_name_to_id.find(name);
    if (id == field_name_to_id.end()) {
        return false;  // Unknown name: leave value as the caller passed it.
    }

    value = id->second;
    return true;
}
std::vector<std::string> RdciSubSystem::split_string(const std::string& s,
char delimiter) const {
std::vector<std::string> tokens;