diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h
index 886c58e84e..154e466f11 100644
--- a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h
+++ b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h
@@ -60,7 +60,9 @@ class RdcMetricFetcherImpl: public RdcMetricFetcher {
     uint64_t now();
     void get_ecc_error(uint32_t gpu_index,
         uint32_t field_id, rdc_field_value* value);
-    void async_get_pcie_throughput(uint32_t gpu_index,
+
+    //! Returns true if an async fetch was started, false otherwise.
+    bool async_get_pcie_throughput(uint32_t gpu_index,
         uint32_t field_id, rdc_field_value* value);
     void get_pcie_throughput(const RdcFieldKey& key);
 
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
index ddd9476e20..269dff471c 100644
--- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
+++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
@@ -120,10 +120,10 @@ void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index,
     }
 }
 
-void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
+bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
     uint32_t field_id, rdc_field_value* value) {
     if (!value) {
-        return;
+        return false;
     }
 
     do {
@@ -136,7 +136,7 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
                 value->status = metric->second.value.status;
                 value->type = metric->second.value.type;
                 value->value = metric->second.value.value;
-                return;
+                return false;
             }
         }
 
@@ -150,6 +150,8 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
             field_id_string(field_id) << " to cache.");
     } while (0);
     cv_.notify_all();
+
+    return true;
 }
 
 void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
@@ -221,6 +223,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
     uint64_t i64 = 0;
     rsmi_temperature_type_t sensor_type;
     rsmi_clk_type_t clk_type;
+    bool async_fetching = false;
 
     if (!is_field_valid(field_id)) {
         RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id
@@ -313,7 +316,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
             break;
         case RDC_FI_PCIE_TX:
         case RDC_FI_PCIE_RX:
-            async_get_pcie_throughput(gpu_index, field_id, value);
+            async_fetching = async_get_pcie_throughput(
+                gpu_index, field_id, value);
             break;
         default:
             break;
@@ -321,9 +325,13 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
 
     int64_t latency = now()-value->ts;
     if (value->status != RSMI_STATUS_SUCCESS) {
-        RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
+        if (async_fetching) {  // Async fetching is not an error
+            RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id));
+        } else {
+            RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" <<
             field_id_string(field_id) << " with rsmi error code "
             << value->status <<", latency " << latency);
+        }
     } else if (value->type == INTEGER) {
         RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
         field_id_string(field_id) << ":" << value->value.l_int
diff --git a/projects/rdc/rdci/include/RdciSubSystem.h b/projects/rdc/rdci/include/RdciSubSystem.h
index c5a6d845ab..6eb67f838e 100644
--- a/projects/rdc/rdci/include/RdciSubSystem.h
+++ b/projects/rdc/rdci/include/RdciSubSystem.h
@@ -43,6 +43,9 @@ class RdciSubSystem {
     std::vector<std::string> split_string(const std::string& s,
         char delimiter) const;
     void show_common_usage() const;
+    //! Looks up a field id by name; returns false if the name is unknown.
+    bool get_field_id_from_name(const std::string& name,
+        uint32_t& value) const;  // NOLINT(runtime/references)
 
     rdc_handle_t rdc_handle_;
     std::string ip_port_;
diff --git a/projects/rdc/rdci/src/RdciDmonSubSystem.cc b/projects/rdc/rdci/src/RdciDmonSubSystem.cc
index 5d30fe5246..6c07ac4061 100644
--- a/projects/rdc/rdci/src/RdciDmonSubSystem.cc
+++ b/projects/rdc/rdci/src/RdciDmonSubSystem.cc
@@ -147,10 +147,15 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) {
         std::vector<std::string> vec_ids = split_string(field_ids, ',');
         for (uint32_t i = 0; i < vec_ids.size(); i++) {
             if (!IsNumber(vec_ids[i])) {
-                throw RdcException(RDC_ST_BAD_PARAMETER, "The field Id "
-                    +vec_ids[i]+" needs to be a number");
+                uint32_t field_id = 0;
+                if (!get_field_id_from_name(vec_ids[i], field_id)) {
+                    throw RdcException(RDC_ST_BAD_PARAMETER,
+                        "The field name "+vec_ids[i]+" is not valid");
+                }
+                field_ids_.push_back(field_id);
+            } else {
+                field_ids_.push_back(std::stoi(vec_ids[i]));
             }
-            field_ids_.push_back(std::stoi(vec_ids[i]));
         }
     }
 }
diff --git a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc
index 4961cd6698..f2e5d10383 100644
--- a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc
+++ b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc
@@ -158,10 +158,13 @@ void RdciFieldGroupSubSystem::process() {
             uint32_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP];
             for (uint32_t i = 0; i < fields.size(); i++) {
                 if (!IsNumber(fields[i])) {
-                    throw RdcException(RDC_ST_BAD_PARAMETER,
-                        "The field Id "+fields[i]+" needs to be a number");
+                    if (!get_field_id_from_name(fields[i], field_ids[i])) {
+                        throw RdcException(RDC_ST_BAD_PARAMETER,
+                            "The field name "+fields[i]+" is not valid");
+                    }
+                } else {
+                    field_ids[i] = std::stoi(fields[i]);
                 }
-                field_ids[i] = std::stoi(fields[i]);
             }
             rdc_field_grp_t group_id;
             result = rdc_group_field_create(rdc_handle_, fields.size(),
diff --git a/projects/rdc/rdci/src/RdciSubSystem.cc b/projects/rdc/rdci/src/RdciSubSystem.cc
index f52d8de544..8d324d4fab 100644
--- a/projects/rdc/rdci/src/RdciSubSystem.cc
+++ b/projects/rdc/rdci/src/RdciSubSystem.cc
@@ -40,6 +40,39 @@ RdciSubSystem::RdciSubSystem():
     }
 }
 
+// Maps a symbolic field name such as "RDC_FI_GPU_TEMP" to its numeric id.
+// On a match the id is written to `value` and true is returned; an unknown
+// name returns false and leaves `value` unchanged.
+// NOTE(review): the ids are hard-coded and presumably mirror rdc_field_t.
+bool RdciSubSystem::get_field_id_from_name(
+    const std::string& name, uint32_t& value) const {
+    // Built once on first use; the table is constant across calls.
+    static const std::map<std::string, uint32_t> field_name_to_id = {
+        {"RDC_FI_GPU_MEMORY_USAGE", 525},
+        {"RDC_FI_GPU_MEMORY_TOTAL", 580},
+        {"RDC_FI_POWER_USAGE", 155},
+        {"RDC_FI_GPU_CLOCK", 100},
+        {"RDC_FI_MEM_CLOCK", 101},
+        {"RDC_FI_PCIE_TX", 200},
+        {"RDC_FI_PCIE_RX", 201},
+        {"RDC_FI_GPU_UTIL", 203},
+        {"RDC_FI_ECC_CORRECT_TOTAL", 312},
+        {"RDC_FI_ECC_UNCORRECT_TOTAL", 313},
+        {"RDC_FI_MEMORY_TEMP", 140},
+        {"RDC_FI_GPU_TEMP", 150},
+        {"RDC_FI_GPU_COUNT", 4},
+        {"RDC_FI_DEV_NAME", 50}
+    };
+
+    auto it = field_name_to_id.find(name);
+    if (it == field_name_to_id.end()) {
+        return false;
+    }
+
+    value = it->second;
+    return true;
+}
+
 std::vector<std::string> RdciSubSystem::split_string(const std::string& s,
         char delimiter) const {
     std::vector<std::string> tokens;