diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index debde6473b..db70734789 100755 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -27,7 +27,11 @@ THE SOFTWARE. extern "C" { #endif // __cplusplus +#ifdef __cplusplus #include +#else +#include +#endif /** \file rdc.h * Main header file for the ROCm RDC library. diff --git a/projects/rdc/include/rdc_lib/RdcTelemetry.h b/projects/rdc/include/rdc_lib/RdcTelemetry.h index 0370643079..1c6bf7e759 100644 --- a/projects/rdc/include/rdc_lib/RdcTelemetry.h +++ b/projects/rdc/include/rdc_lib/RdcTelemetry.h @@ -24,26 +24,11 @@ THE SOFTWARE. #include #include "rdc/rdc.h" +#include "rdc_lib/RdcTelemetryLibInterface.h" namespace amd { namespace rdc { -// Structure to keep both gup index and field value -typedef struct { - uint32_t gpu_index; - rdc_field_value field_value; -} rdc_gpu_field_value_t; - - -typedef struct { - uint32_t gpu_index; - rdc_field_t field_id; -} rdc_gpu_field_t; - -#define MAX_NUM_FIELDS 8192 -typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values, - uint32_t num_values, void* user_data); - class RdcTelemetry { public: // get support field ids diff --git a/projects/rdc/include/rdc_lib/RdcTelemetryLibInterface.h b/projects/rdc/include/rdc_lib/RdcTelemetryLibInterface.h new file mode 100644 index 0000000000..97194ed6ed --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcTelemetryLibInterface.h @@ -0,0 +1,68 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ +#define RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ + +// The telemetry interface for libraries, for example, RAS. +#include +#include + +extern "C" { + +// Structure to keep both gup index and field value +typedef struct { + uint32_t gpu_index; + rdc_field_value field_value; +} rdc_gpu_field_value_t; + +typedef struct { + uint32_t gpu_index; + rdc_field_t field_id; +} rdc_gpu_field_t; + +#define MAX_NUM_FIELDS 8192 +typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values, + uint32_t num_values, void* user_data); + +// The library will implement below function + +rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count); + +rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, + uint32_t fields_count, rdc_field_value_f callback, + void* user_data); + + +rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, + uint32_t fields_count); + +rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count); + +rdc_status_t rdc_module_init(uint64_t flags); + +rdc_status_t rdc_module_destroy(); + +} + +#endif // RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h index 4207f2fe62..2274c382f4 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -24,6 +24,7 @@ THE SOFTWARE. #include // NOLINT(build/c++11) #include // NOLINT(build/c++11) +#include #include // NOLINT(build/c++11) #include #include @@ -54,7 +55,7 @@ struct FieldRSMIData { union { rsmi_counter_value_t counter_val; }; - ~FieldRSMIData(){} + ~FieldRSMIData() {} FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0}{} }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h index 7440ed28f6..1308444302 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h @@ -61,6 +61,9 @@ class RdcRasLib: public RdcTelemetry { uint32_t, rdc_field_value_f, void*); rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*); + rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t); + rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t); + rdc_status_t (*rdc_module_init_)(uint64_t); rdc_status_t (*rdc_module_destroy_)(); }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h b/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h index 3a67e5bda3..c0688cd5fc 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h +++ b/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h @@ -24,6 +24,7 @@ THE SOFTWARE. #include #include +#include #include #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" @@ -49,7 +50,15 @@ class RdcTelemetryModule : public RdcTelemetry { RdcTelemetryModule(const RdcMetricFetcherPtr& fetcher, const RdcRasLibPtr& ras_module); + private: + //< Helper function to dispatch fields to module + void get_fields_for_module( + rdc_gpu_field_t* fields, + uint32_t fields_count, + std::map> + & fields_in_module, + std::vector& unsupport_fields); // NOLINT std::list telemetry_modules_; std::map fields_id_module_; }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h index c2c2b3fde6..6d95a0d7d3 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -83,10 +83,8 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< once per second. rdc_status_t rdc_field_update_all() override; - // TODO(bill_liu): Remove the RdcMetricFetcherPtr RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, - const RdcMetricFetcherPtr& metric_fetcher, const RdcModuleMgrPtr& module_mgr); private: @@ -116,7 +114,6 @@ class RdcWatchTableImpl : public RdcWatchTable { RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; - RdcMetricFetcherPtr metric_fetcher_; RdcModuleMgrPtr rdc_module_mgr_; //!< The watch table to store the watch settings. diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index 02d45a4dfb..6a8da28229 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -31,6 +31,11 @@ class rdc_status_t(Enum): RDC_ST_CLIENT_ERROR = 8 RDC_ST_ALREADY_EXIST = 9 RDC_ST_MAX_LIMIT = 10 + RDC_ST_INSUFF_RESOURCES = 11 + RDC_ST_FILE_ERROR = 12 + RDC_ST_NO_DATA = 13 + RDC_ST_PERM_ERROR = 14 + RDC_ST_UNKNOWN_ERROR = 4294967295 class rdc_operation_mode_t(c_int): RDC_OPERATION_MODE_AUTO = 0 @@ -62,6 +67,16 @@ class rdc_field_t(c_int): RDC_FI_GPU_MEMORY_TOTAL = 502 RDC_FI_ECC_CORRECT_TOTAL = 600 RDC_FI_ECC_UNCORRECT_TOTAL = 601 + RDC_EVNT_XGMI_0_NOP_TX = 1000 + RDC_EVNT_XGMI_0_REQ_TX = 1001 + RDC_EVNT_XGMI_0_RESP_TX = 1002 + RDC_EVNT_XGMI_0_BEATS_TX = 1003 + RDC_EVNT_XGMI_1_NOP_TX = 1004 + RDC_EVNT_XGMI_1_REQ_TX = 1005 + RDC_EVNT_XGMI_1_RESP_TX = 1006 + RDC_EVNT_XGMI_1_BEATS_TX = 1007 + RDC_EVNT_XGMI_0_THRPUT = 1500 + RDC_EVNT_XGMI_1_THRPUT = 1501 rdc_handle_t = c_void_p rdc_gpu_group_t = c_uint32 diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 414378853b..202b2032aa 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -73,7 +73,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode): , metric_fetcher_(new RdcMetricFetcherImpl()) , rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)) , watch_table_(new RdcWatchTableImpl(group_settings_, - cache_mgr_, metric_fetcher_, rdc_module_mgr_)) + cache_mgr_, rdc_module_mgr_)) , metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) { if (mode == RDC_OPERATION_MODE_AUTO) { diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc index b823333eb3..f08e56d44d 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc @@ -30,6 +30,8 @@ namespace rdc { RdcRasLib::RdcRasLib(const char* lib_name): fields_value_get_(nullptr) , fields_query_(nullptr) + , fields_watch_(nullptr) + , fields_unwatch_(nullptr) , rdc_module_init_(nullptr) , rdc_module_destroy_(nullptr) { rdc_status_t status = lib_loader_.load(lib_name); @@ -67,6 +69,16 @@ RdcRasLib::RdcRasLib(const char* lib_name): if (status != RDC_ST_OK) { fields_query_ = nullptr; } + status = lib_loader_.load_symbol(&fields_watch_, + "rdc_telemetry_fields_watch"); + if (status != RDC_ST_OK) { + fields_watch_ = nullptr; + } + status = lib_loader_.load_symbol(&fields_unwatch_, + "rdc_telemetry_fields_unwatch"); + if (status != RDC_ST_OK) { + fields_unwatch_ = nullptr; + } } RdcRasLib::~RdcRasLib() { @@ -85,8 +97,9 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_query( return RDC_ST_FAIL_LOAD_MODULE; } + auto status = fields_query_(field_ids, field_count); RDC_LOG(RDC_DEBUG, "RAS support " << *field_count << " fields"); - return fields_query_(field_ids, field_count); + return status; } rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get( @@ -107,24 +120,30 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get( rdc_status_t RdcRasLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) { - // TODO(bill_liu): Not Support yet if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } - (void)fields; - (void)fields_count; - return RDC_ST_NOT_SUPPORTED; + if (!fields_watch_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + rdc_status_t status = fields_watch_(fields, fields_count); + RDC_LOG(RDC_DEBUG, "Watch " << fields_count << " fields from RAS: " + << rdc_status_string(status)); + return status; } rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) { - // TODO(bill_liu): Not Support yet if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } - (void)fields; - (void)fields_count; - return RDC_ST_NOT_SUPPORTED; + if (!fields_unwatch_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + rdc_status_t status = fields_unwatch_(fields, fields_count); + RDC_LOG(RDC_DEBUG, "Unwatch " << fields_count << " fields from RAS: " + << rdc_status_string(status)); + return status; } } // namespace rdc diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 43b2dcbe6c..fdb378e5fd 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -74,24 +74,34 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) { - // TODO(bill_liu): Not Support yet if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } - (void)fields; - (void)fields_count; - return RDC_ST_NOT_SUPPORTED; + + for (uint32_t i = 0; i < fields_count; i++) { + metric_fetcher_->acquire_rsmi_handle( + {fields[i].gpu_index, fields[i].field_id}); + } + RDC_LOG(RDC_DEBUG, "acquire " << fields_count + << " field handles from rocm_smi_lib"); + + return RDC_ST_OK; } rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) { - // TODO(bill_liu): Not Support yet if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } - (void)fields; - (void)fields_count; - return RDC_ST_NOT_SUPPORTED; + + for (uint32_t i = 0; i < fields_count; i++) { + metric_fetcher_->delete_rsmi_handle( + {fields[i].gpu_index, fields[i].field_id}); + } + RDC_LOG(RDC_DEBUG, "delete " << fields_count + << " field handles from rocm_smi_lib"); + + return RDC_ST_OK; } rdc_status_t RdcSmiLib::rdc_telemetry_fields_query( diff --git a/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc b/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc index ddda670938..0f6aa7469b 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc @@ -35,10 +35,10 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_query( } auto ite = telemetry_modules_.begin(); *field_count = 0; - uint32_t count = 0; for (; ite != telemetry_modules_.end(); ite++) { + uint32_t count = 0; rdc_status_t status = (*ite)->rdc_telemetry_fields_query( - &(field_ids[count]), &count); + &(field_ids[*field_count]), &count); if (status == RDC_ST_OK) { *field_count += count; } @@ -52,11 +52,20 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_watch( if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } - auto ite = telemetry_modules_.begin(); - for (; ite != telemetry_modules_.end(); ite++) { - (*ite)->rdc_telemetry_fields_watch( - fields, fields_count); + + std::map> fields_in_module; + std::vector unsupport_fields; + get_fields_for_module(fields, fields_count, + fields_in_module, unsupport_fields); + + auto ite = fields_in_module.begin(); + for (; ite != fields_in_module.end(); ite++) { + if (ite->second.size() > 0) { + ite->first->rdc_telemetry_fields_watch( + &ite->second[0], ite->second.size()); + } } + return RDC_ST_OK; } @@ -66,11 +75,20 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch( if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } - auto ite = telemetry_modules_.begin(); - for (; ite != telemetry_modules_.end(); ite++) { - (*ite)->rdc_telemetry_fields_unwatch( - fields, fields_count); + + std::map> fields_in_module; + std::vector unsupport_fields; + get_fields_for_module(fields, fields_count, + fields_in_module, unsupport_fields); + + auto ite = fields_in_module.begin(); + for (; ite != fields_in_module.end(); ite++) { + if (ite->second.size() > 0) { + ite->first->rdc_telemetry_fields_unwatch( + &ite->second[0], ite->second.size()); + } } + return RDC_ST_OK; } @@ -86,14 +104,40 @@ RdcTelemetryModule::RdcTelemetryModule( auto ite = telemetry_modules_.begin(); for (; ite != telemetry_modules_.end(); ite++) { uint32_t field_ids[MAX_NUM_FIELDS]; - uint32_t field_count; - (*ite)->rdc_telemetry_fields_query(field_ids, &field_count); - for (uint32_t index = 0; index < field_count; index++) { - fields_id_module_.insert({field_ids[index], (*ite)}); + uint32_t field_count = 0; + rdc_status_t status = (*ite)-> + rdc_telemetry_fields_query(field_ids, &field_count); + if (status == RDC_ST_OK) { + for (uint32_t index = 0; index < field_count; index++) { + fields_id_module_.insert({field_ids[index], (*ite)}); + } } } } +void RdcTelemetryModule::get_fields_for_module( + rdc_gpu_field_t* fields, + uint32_t fields_count, + std::map>& + fields_in_module, + std::vector& unsupport_fields) { + for (uint32_t findex = 0; findex < fields_count; findex++) { + RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id]; + if (module) { + fields_in_module[module].push_back(fields[findex]); + } else { + RDC_LOG(RDC_DEBUG, "Unsupported field " << + field_id_string(fields[findex].field_id)); + rdc_gpu_field_value_t value; + value.gpu_index = fields[findex].gpu_index; + value.field_value.field_id = fields[findex].field_id; + value.field_value.status = RDC_ST_NOT_SUPPORTED; + unsupport_fields.push_back(value); + } + } +} + + rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get( rdc_gpu_field_t* fields, uint32_t fields_count, rdc_field_value_f callback, void* user_data) { @@ -104,20 +148,8 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get( // Dispatch the fields to the libraries std::map> fields_to_fetch; std::vector unsupport_fields; - for (uint32_t findex = 0; findex < fields_count; findex++) { - RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id]; - if (module) { - fields_to_fetch[module].push_back(fields[findex]); - } else { - RDC_LOG(RDC_DEBUG, "Unsupported field " << - field_id_string(fields[findex].field_id)); - rdc_gpu_field_value_t value; - value.gpu_index = fields[findex].gpu_index; - value.field_value.field_id = fields[findex].field_id; - value.field_value.status = RDC_ST_NOT_SUPPORTED; - unsupport_fields.push_back(value); - } - } + get_fields_for_module(fields, fields_count, + fields_to_fetch, unsupport_fields); auto ite = fields_to_fetch.begin(); for (; ite != fields_to_fetch.end(); ite ++) { diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 1d2d82df9d..692dd5e93b 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -38,11 +38,9 @@ namespace rdc { RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, - const RdcMetricFetcherPtr& metric_fetcher, const RdcModuleMgrPtr& module_mgr): group_settings_(group_settings) , cache_mgr_(cache_mgr) - , metric_fetcher_(metric_fetcher) , rdc_module_mgr_(module_mgr) , last_cleanup_time_(0) { } @@ -211,15 +209,44 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, return result; } + // Skip not supported fields + uint32_t unsupported_fields = 0; + auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); + if (rdc_telemetry) { + uint32_t field_ids[MAX_NUM_FIELDS]; + uint32_t field_count; + result = rdc_telemetry-> + rdc_telemetry_fields_query(field_ids, &field_count); + if (result == RDC_ST_OK) { + RDC_LOG(RDC_DEBUG, "The system support " + << field_count << " fields"); + for (auto it = fields_in_watch.begin(); + it != fields_in_watch.end(); ) { + bool not_supported = true; + for (uint32_t fi = 0; fi < field_count; fi++) { + if (field_ids[fi] == it->second) { + not_supported = false; + break; + } + } + if (not_supported) { + it = fields_in_watch.erase(it); + unsupported_fields++; + } else { + it++; + } + } // end for + } // end if + } + if ( unsupported_fields >0 ) { + RDC_LOG(RDC_DEBUG, "Skip watch " << unsupported_fields + <<" fields as they are not supported."); + } + // Update the fields_to_watch_ auto f_in_watch_iter = fields_in_watch.begin(); for (; f_in_watch_iter != fields_in_watch.end(); f_in_watch_iter++) { - // Skip not support fields - result = metric_fetcher_->acquire_rsmi_handle(*f_in_watch_iter); - if (result != RDC_ST_OK) { - continue; - } auto ite = fields_to_watch_.find(*f_in_watch_iter); if (ite == fields_to_watch_.end()) { // A new field fields_to_watch_.insert({*f_in_watch_iter, f}); @@ -242,6 +269,21 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, // Add to the watch table watch_table_.insert({gkey, f}); + // Notify the telemetry_module to watch those fields + if (rdc_telemetry) { + std::vector fields; + auto fields_to_watch_iter = fields_to_watch_.begin(); + for (; fields_to_watch_iter != fields_to_watch_.end(); + fields_to_watch_iter++) { + if (fields_to_watch_iter->second.is_watching) { + fields.push_back({fields_to_watch_iter->first.first, + fields_to_watch_iter->first.second}); + } + } + rdc_telemetry->rdc_telemetry_fields_watch(&fields[0], + fields.size()); + } + return RDC_ST_OK; } @@ -293,27 +335,28 @@ rdc_status_t RdcWatchTableImpl::update_field_in_table_when_unwatch( // Update the fields that impacted by this unwatch auto fite = fields.begin(); + std::vector unwatch_fields; for (; fite != fields.end(); fite++) { auto f_in_table = fields_to_watch_.find((*fite)); if (f_in_table == fields_to_watch_.end()) { // Not in fields_to_watch_ + unwatch_fields.push_back({fite->first, fite->second}); continue; } auto freq_iter = update_frequencies.find(*fite); if (freq_iter == update_frequencies.end()) { f_in_table->second.is_watching = false; + unwatch_fields.push_back({fite->first, fite->second}); } else { f_in_table->second.update_freq = freq_iter->second; } } - fite = fields.begin(); - for (; fite != fields.end(); fite++) { - result = metric_fetcher_->delete_rsmi_handle(*fite); - - if (result != RDC_ST_OK && result != RDC_ST_NOT_SUPPORTED) { - return result; - } + // Notify the telemetry_module to unwatch those fields + auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); + if (rdc_telemetry) { + rdc_telemetry->rdc_telemetry_fields_unwatch(&unwatch_fields[0], + unwatch_fields.size()); } return RDC_ST_OK;