Add support to use the field name in rdci
In the rdci dmon and fieldgroup subcommands, fields can now be specified
by either their numeric ID or their field name.
When RDC is fetching a metric asynchronously, the pending fetch is no
longer reported as an error.
Change-Id: I81331e2c239af987181147be5ac0e29ba1617ab4
[ROCm/rdc commit: d30cb81fdb]
This commit is contained in:
committed by
Chris Freehill
parent
aa0a40f84d
commit
0f8f345992
@@ -60,7 +60,9 @@ class RdcMetricFetcherImpl: public RdcMetricFetcher {
|
||||
uint64_t now();
|
||||
void get_ecc_error(uint32_t gpu_index,
|
||||
uint32_t field_id, rdc_field_value* value);
|
||||
void async_get_pcie_throughput(uint32_t gpu_index,
|
||||
|
||||
//!< return true if starting async_get
|
||||
bool async_get_pcie_throughput(uint32_t gpu_index,
|
||||
uint32_t field_id, rdc_field_value* value);
|
||||
void get_pcie_throughput(const RdcFieldKey& key);
|
||||
|
||||
|
||||
@@ -120,10 +120,10 @@ void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index,
|
||||
}
|
||||
}
|
||||
|
||||
void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
|
||||
bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
|
||||
uint32_t field_id, rdc_field_value* value) {
|
||||
if (!value) {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
do {
|
||||
@@ -136,7 +136,7 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
|
||||
value->status = metric->second.value.status;
|
||||
value->type = metric->second.value.type;
|
||||
value->value = metric->second.value.value;
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,6 +150,8 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
|
||||
field_id_string(field_id) << " to cache.");
|
||||
} while (0);
|
||||
cv_.notify_all();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
|
||||
@@ -221,6 +223,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
|
||||
uint64_t i64 = 0;
|
||||
rsmi_temperature_type_t sensor_type;
|
||||
rsmi_clk_type_t clk_type;
|
||||
bool async_fetching = false;
|
||||
|
||||
if (!is_field_valid(field_id)) {
|
||||
RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id
|
||||
@@ -313,7 +316,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
|
||||
break;
|
||||
case RDC_FI_PCIE_TX:
|
||||
case RDC_FI_PCIE_RX:
|
||||
async_get_pcie_throughput(gpu_index, field_id, value);
|
||||
async_fetching = async_get_pcie_throughput(
|
||||
gpu_index, field_id, value);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -321,9 +325,13 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
|
||||
|
||||
int64_t latency = now()-value->ts;
|
||||
if (value->status != RSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
|
||||
if (async_fetching) { //!< Async fetching is not an error
|
||||
RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id));
|
||||
} else {
|
||||
RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
|
||||
field_id_string(field_id) << " with rsmi error code "
|
||||
<< value->status <<", latency " << latency);
|
||||
}
|
||||
} else if (value->type == INTEGER) {
|
||||
RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
|
||||
field_id_string(field_id) << ":" << value->value.l_int
|
||||
|
||||
@@ -43,6 +43,8 @@ class RdciSubSystem {
|
||||
std::vector<std::string> split_string(const std::string& s,
|
||||
char delimiter) const;
|
||||
void show_common_usage() const;
|
||||
bool get_field_id_from_name(const std::string& name,
|
||||
uint32_t& value) const; // NOLINT(runtime/references)
|
||||
rdc_handle_t rdc_handle_;
|
||||
std::string ip_port_;
|
||||
|
||||
|
||||
@@ -147,10 +147,15 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) {
|
||||
std::vector<std::string> vec_ids = split_string(field_ids, ',');
|
||||
for (uint32_t i = 0; i < vec_ids.size(); i++) {
|
||||
if (!IsNumber(vec_ids[i])) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "The field Id "
|
||||
+vec_ids[i]+" needs to be a number");
|
||||
uint32_t field_id = 0;
|
||||
if (!get_field_id_from_name(vec_ids[i], field_id)) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER,
|
||||
"The field name "+vec_ids[i]+" is not valid");
|
||||
}
|
||||
field_ids_.push_back(field_id);
|
||||
} else {
|
||||
field_ids_.push_back(std::stoi(vec_ids[i]));
|
||||
}
|
||||
field_ids_.push_back(std::stoi(vec_ids[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,10 +158,13 @@ void RdciFieldGroupSubSystem::process() {
|
||||
uint32_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP];
|
||||
for (uint32_t i = 0; i < fields.size(); i++) {
|
||||
if (!IsNumber(fields[i])) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER,
|
||||
"The field Id "+fields[i]+" needs to be a number");
|
||||
if (!get_field_id_from_name(fields[i], field_ids[i])) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER,
|
||||
"The field name "+fields[i]+" is not valid");
|
||||
}
|
||||
} else {
|
||||
field_ids[i] = std::stoi(fields[i]);
|
||||
}
|
||||
field_ids[i] = std::stoi(fields[i]);
|
||||
}
|
||||
rdc_field_grp_t group_id;
|
||||
result = rdc_group_field_create(rdc_handle_, fields.size(),
|
||||
|
||||
@@ -40,6 +40,34 @@ RdciSubSystem::RdciSubSystem():
|
||||
}
|
||||
}
|
||||
|
||||
bool RdciSubSystem::get_field_id_from_name(
|
||||
const std::string& name, uint32_t& value) const {
|
||||
const std::map<std::string, uint32_t> field_name_to_id = {
|
||||
{"RDC_FI_GPU_MEMORY_USAGE", 525},
|
||||
{"RDC_FI_GPU_MEMORY_TOTAL", 580},
|
||||
{"RDC_FI_POWER_USAGE", 155},
|
||||
{"RDC_FI_GPU_CLOCK", 100},
|
||||
{"RDC_FI_MEM_CLOCK", 101},
|
||||
{"RDC_FI_PCIE_TX", 200},
|
||||
{"RDC_FI_PCIE_RX", 201},
|
||||
{"RDC_FI_GPU_UTIL", 203},
|
||||
{"RDC_FI_ECC_CORRECT_TOTAL", 312},
|
||||
{"RDC_FI_ECC_UNCORRECT_TOTAL", 313},
|
||||
{"RDC_FI_MEMORY_TEMP", 140},
|
||||
{"RDC_FI_GPU_TEMP", 150},
|
||||
{"RDC_FI_GPU_COUNT", 4},
|
||||
{"RDC_FI_DEV_NAME", 50}
|
||||
};
|
||||
|
||||
auto id = field_name_to_id.find(name);
|
||||
if (id == field_name_to_id.end()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
value = id->second;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<std::string> RdciSubSystem::split_string(const std::string& s,
|
||||
char delimiter) const {
|
||||
std::vector<std::string> tokens;
|
||||
|
||||
Reference in New Issue
Block a user