From 0f8f345992cd56b69a3eb2faf493916660c8f2bb Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Tue, 19 May 2020 09:38:50 -0400 Subject: [PATCH] Add support to use the field name in rdci In the rdci dmon and fieldgroup, now the fields can be specified using either number id or the field name. When the rdc is async fetching metrics, it will not report that fetch as an error. Change-Id: I81331e2c239af987181147be5ac0e29ba1617ab4 [ROCm/rdc commit: d30cb81fdb4695b1a36c1b354c0da56422341731] --- .../rdc_lib/impl/RdcMetricFetcherImpl.h | 4 ++- .../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 18 ++++++++---- projects/rdc/rdci/include/RdciSubSystem.h | 2 ++ projects/rdc/rdci/src/RdciDmonSubSystem.cc | 11 ++++++-- .../rdc/rdci/src/RdciFieldGroupSubSystem.cc | 9 ++++-- projects/rdc/rdci/src/RdciSubSystem.cc | 28 +++++++++++++++++++ 6 files changed, 60 insertions(+), 12 deletions(-) diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h index 886c58e84e..154e466f11 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -60,7 +60,9 @@ class RdcMetricFetcherImpl: public RdcMetricFetcher { uint64_t now(); void get_ecc_error(uint32_t gpu_index, uint32_t field_id, rdc_field_value* value); - void async_get_pcie_throughput(uint32_t gpu_index, + + //!< return true if starting async_get + bool async_get_pcie_throughput(uint32_t gpu_index, uint32_t field_id, rdc_field_value* value); void get_pcie_throughput(const RdcFieldKey& key); diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index ddd9476e20..269dff471c 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -120,10 +120,10 @@ void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, } } -void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, +bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, uint32_t field_id, rdc_field_value* value) { if (!value) { - return; + return false; } do { @@ -136,7 +136,7 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, value->status = metric->second.value.status; value->type = metric->second.value.type; value->value = metric->second.value.value; - return; + return false; } } @@ -150,6 +150,8 @@ void RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, field_id_string(field_id) << " to cache."); } while (0); cv_.notify_all(); + + return true; } void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { @@ -221,6 +223,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, uint64_t i64 = 0; rsmi_temperature_type_t sensor_type; rsmi_clk_type_t clk_type; + bool async_fetching = false; if (!is_field_valid(field_id)) { RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id @@ -313,7 +316,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, break; case RDC_FI_PCIE_TX: case RDC_FI_PCIE_RX: - async_get_pcie_throughput(gpu_index, field_id, value); + async_fetching = async_get_pcie_throughput( + gpu_index, field_id, value); break; default: break; @@ -321,9 +325,13 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, int64_t latency = now()-value->ts; if (value->status != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << + if (async_fetching) { //!< Async fetching is not an error + RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id)); + } else { + RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << field_id_string(field_id) << " with rsmi error code " << value->status <<", latency " << latency); + } } else if (value->type == INTEGER) { RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << field_id_string(field_id) << ":" << value->value.l_int diff --git a/projects/rdc/rdci/include/RdciSubSystem.h b/projects/rdc/rdci/include/RdciSubSystem.h index c5a6d845ab..6eb67f838e 100644 --- a/projects/rdc/rdci/include/RdciSubSystem.h +++ b/projects/rdc/rdci/include/RdciSubSystem.h @@ -43,6 +43,8 @@ class RdciSubSystem { std::vector split_string(const std::string& s, char delimiter) const; void show_common_usage() const; + bool get_field_id_from_name(const std::string& name, + uint32_t& value) const; // NOLINT(runtime/references) rdc_handle_t rdc_handle_; std::string ip_port_; diff --git a/projects/rdc/rdci/src/RdciDmonSubSystem.cc b/projects/rdc/rdci/src/RdciDmonSubSystem.cc index 5d30fe5246..6c07ac4061 100644 --- a/projects/rdc/rdci/src/RdciDmonSubSystem.cc +++ b/projects/rdc/rdci/src/RdciDmonSubSystem.cc @@ -147,10 +147,15 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char ** argv) { std::vector vec_ids = split_string(field_ids, ','); for (uint32_t i = 0; i < vec_ids.size(); i++) { if (!IsNumber(vec_ids[i])) { - throw RdcException(RDC_ST_BAD_PARAMETER, "The field Id " - +vec_ids[i]+" needs to be a number"); + uint32_t field_id = 0; + if (!get_field_id_from_name(vec_ids[i], field_id)) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The field name "+vec_ids[i]+" is not valid"); + } + field_ids_.push_back(field_id); + } else { + field_ids_.push_back(std::stoi(vec_ids[i])); } - field_ids_.push_back(std::stoi(vec_ids[i])); } } } diff --git a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc index 4961cd6698..f2e5d10383 100644 --- a/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc +++ b/projects/rdc/rdci/src/RdciFieldGroupSubSystem.cc @@ -158,10 +158,13 @@ void RdciFieldGroupSubSystem::process() { uint32_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; for (uint32_t i = 0; i < fields.size(); i++) { if (!IsNumber(fields[i])) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The field Id "+fields[i]+" needs to be a number"); + if (!get_field_id_from_name(fields[i], field_ids[i])) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "The field name "+fields[i]+" is not valid"); + } + } else { + field_ids[i] = std::stoi(fields[i]); } - field_ids[i] = std::stoi(fields[i]); } rdc_field_grp_t group_id; result = rdc_group_field_create(rdc_handle_, fields.size(), diff --git a/projects/rdc/rdci/src/RdciSubSystem.cc b/projects/rdc/rdci/src/RdciSubSystem.cc index f52d8de544..8d324d4fab 100644 --- a/projects/rdc/rdci/src/RdciSubSystem.cc +++ b/projects/rdc/rdci/src/RdciSubSystem.cc @@ -40,6 +40,34 @@ RdciSubSystem::RdciSubSystem(): } } +bool RdciSubSystem::get_field_id_from_name( + const std::string& name, uint32_t& value) const { + const std::map field_name_to_id = { + {"RDC_FI_GPU_MEMORY_USAGE", 525}, + {"RDC_FI_GPU_MEMORY_TOTAL", 580}, + {"RDC_FI_POWER_USAGE", 155}, + {"RDC_FI_GPU_CLOCK", 100}, + {"RDC_FI_MEM_CLOCK", 101}, + {"RDC_FI_PCIE_TX", 200}, + {"RDC_FI_PCIE_RX", 201}, + {"RDC_FI_GPU_UTIL", 203}, + {"RDC_FI_ECC_CORRECT_TOTAL", 312}, + {"RDC_FI_ECC_UNCORRECT_TOTAL", 313}, + {"RDC_FI_MEMORY_TEMP", 140}, + {"RDC_FI_GPU_TEMP", 150}, + {"RDC_FI_GPU_COUNT", 4}, + {"RDC_FI_DEV_NAME", 50} + }; + + auto id = field_name_to_id.find(name); + if (id == field_name_to_id.end()) { + return false; + } + + value = id->second; + return true; +} + std::vector RdciSubSystem::split_string(const std::string& s, char delimiter) const { std::vector tokens;