Support watch() and unwatch() in RDC module framework
The framework now supports watch() and unwatch(), which can be used
by the telemetry library to init events or pre-fetch fields when recording
starts.
* A new header file RdcTelemetryLibInterface.h is defined for library to
include it.
* The RdcWatchTable will not talk to RdcMetricFetcher directly anymore.
It will call the framework watch/unwatch to dispatch it to the libraries.
* Make the python binding consistent with the current code.
Change-Id: Ie5731d920ed5928f901369d60c23bd450807a562
[ROCm/rdc commit: 151520b97e]
Cette révision appartient à :
@@ -27,7 +27,11 @@ THE SOFTWARE.
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <cstdint>
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
/** \file rdc.h
|
||||
* Main header file for the ROCm RDC library.
|
||||
|
||||
@@ -24,26 +24,11 @@ THE SOFTWARE.
|
||||
|
||||
#include <memory>
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcTelemetryLibInterface.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Structure to keep both gup index and field value
|
||||
typedef struct {
|
||||
uint32_t gpu_index;
|
||||
rdc_field_value field_value;
|
||||
} rdc_gpu_field_value_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
uint32_t gpu_index;
|
||||
rdc_field_t field_id;
|
||||
} rdc_gpu_field_t;
|
||||
|
||||
#define MAX_NUM_FIELDS 8192
|
||||
typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values,
|
||||
uint32_t num_values, void* user_data);
|
||||
|
||||
class RdcTelemetry {
|
||||
public:
|
||||
// get support field ids
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
|
||||
#define RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
|
||||
|
||||
// The telemetry interface for libraries, for example, RAS.
|
||||
#include <rdc/rdc.h>
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Structure to keep both gup index and field value
|
||||
typedef struct {
|
||||
uint32_t gpu_index;
|
||||
rdc_field_value field_value;
|
||||
} rdc_gpu_field_value_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t gpu_index;
|
||||
rdc_field_t field_id;
|
||||
} rdc_gpu_field_t;
|
||||
|
||||
#define MAX_NUM_FIELDS 8192
|
||||
typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values,
|
||||
uint32_t num_values, void* user_data);
|
||||
|
||||
// The library will implement below function
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
|
||||
uint32_t* field_count);
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count, rdc_field_value_f callback,
|
||||
void* user_data);
|
||||
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count);
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count);
|
||||
|
||||
rdc_status_t rdc_module_init(uint64_t flags);
|
||||
|
||||
rdc_status_t rdc_module_destroy();
|
||||
|
||||
}
|
||||
|
||||
#endif // RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
|
||||
@@ -24,6 +24,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <mutex> // NOLINT(build/c++11)
|
||||
#include <future> // NOLINT(build/c++11)
|
||||
#include <memory>
|
||||
#include <condition_variable> // NOLINT(build/c++11)
|
||||
#include <map>
|
||||
#include <queue>
|
||||
@@ -54,7 +55,7 @@ struct FieldRSMIData {
|
||||
union {
|
||||
rsmi_counter_value_t counter_val;
|
||||
};
|
||||
~FieldRSMIData(){}
|
||||
~FieldRSMIData() {}
|
||||
FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0}{}
|
||||
};
|
||||
|
||||
|
||||
@@ -61,6 +61,9 @@ class RdcRasLib: public RdcTelemetry {
|
||||
uint32_t, rdc_field_value_f, void*);
|
||||
rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*);
|
||||
|
||||
rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t);
|
||||
rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t);
|
||||
|
||||
rdc_status_t (*rdc_module_init_)(uint64_t);
|
||||
rdc_status_t (*rdc_module_destroy_)();
|
||||
};
|
||||
|
||||
@@ -24,6 +24,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <map>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
@@ -49,7 +50,15 @@ class RdcTelemetryModule : public RdcTelemetry {
|
||||
|
||||
RdcTelemetryModule(const RdcMetricFetcherPtr& fetcher,
|
||||
const RdcRasLibPtr& ras_module);
|
||||
|
||||
private:
|
||||
//< Helper function to dispatch fields to module
|
||||
void get_fields_for_module(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>>
|
||||
& fields_in_module,
|
||||
std::vector<rdc_gpu_field_value_t>& unsupport_fields); // NOLINT
|
||||
std::list<RdcTelemetryPtr> telemetry_modules_;
|
||||
std::map<uint32_t, RdcTelemetryPtr> fields_id_module_;
|
||||
};
|
||||
|
||||
@@ -83,10 +83,8 @@ class RdcWatchTableImpl : public RdcWatchTable {
|
||||
//!< once per second.
|
||||
rdc_status_t rdc_field_update_all() override;
|
||||
|
||||
// TODO(bill_liu): Remove the RdcMetricFetcherPtr
|
||||
RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
|
||||
const RdcCacheManagerPtr& cache_mgr,
|
||||
const RdcMetricFetcherPtr& metric_fetcher,
|
||||
const RdcModuleMgrPtr& module_mgr);
|
||||
|
||||
private:
|
||||
@@ -116,7 +114,6 @@ class RdcWatchTableImpl : public RdcWatchTable {
|
||||
|
||||
RdcGroupSettingsPtr group_settings_;
|
||||
RdcCacheManagerPtr cache_mgr_;
|
||||
RdcMetricFetcherPtr metric_fetcher_;
|
||||
RdcModuleMgrPtr rdc_module_mgr_;
|
||||
|
||||
//!< The watch table to store the watch settings.
|
||||
|
||||
@@ -31,6 +31,11 @@ class rdc_status_t(Enum):
|
||||
RDC_ST_CLIENT_ERROR = 8
|
||||
RDC_ST_ALREADY_EXIST = 9
|
||||
RDC_ST_MAX_LIMIT = 10
|
||||
RDC_ST_INSUFF_RESOURCES = 11
|
||||
RDC_ST_FILE_ERROR = 12
|
||||
RDC_ST_NO_DATA = 13
|
||||
RDC_ST_PERM_ERROR = 14
|
||||
RDC_ST_UNKNOWN_ERROR = 4294967295
|
||||
|
||||
class rdc_operation_mode_t(c_int):
|
||||
RDC_OPERATION_MODE_AUTO = 0
|
||||
@@ -62,6 +67,16 @@ class rdc_field_t(c_int):
|
||||
RDC_FI_GPU_MEMORY_TOTAL = 502
|
||||
RDC_FI_ECC_CORRECT_TOTAL = 600
|
||||
RDC_FI_ECC_UNCORRECT_TOTAL = 601
|
||||
RDC_EVNT_XGMI_0_NOP_TX = 1000
|
||||
RDC_EVNT_XGMI_0_REQ_TX = 1001
|
||||
RDC_EVNT_XGMI_0_RESP_TX = 1002
|
||||
RDC_EVNT_XGMI_0_BEATS_TX = 1003
|
||||
RDC_EVNT_XGMI_1_NOP_TX = 1004
|
||||
RDC_EVNT_XGMI_1_REQ_TX = 1005
|
||||
RDC_EVNT_XGMI_1_RESP_TX = 1006
|
||||
RDC_EVNT_XGMI_1_BEATS_TX = 1007
|
||||
RDC_EVNT_XGMI_0_THRPUT = 1500
|
||||
RDC_EVNT_XGMI_1_THRPUT = 1501
|
||||
|
||||
rdc_handle_t = c_void_p
|
||||
rdc_gpu_group_t = c_uint32
|
||||
|
||||
@@ -73,7 +73,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode):
|
||||
, metric_fetcher_(new RdcMetricFetcherImpl())
|
||||
, rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_))
|
||||
, watch_table_(new RdcWatchTableImpl(group_settings_,
|
||||
cache_mgr_, metric_fetcher_, rdc_module_mgr_))
|
||||
cache_mgr_, rdc_module_mgr_))
|
||||
, metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_,
|
||||
METIC_UPDATE_FREQUENCY)) {
|
||||
if (mode == RDC_OPERATION_MODE_AUTO) {
|
||||
|
||||
@@ -30,6 +30,8 @@ namespace rdc {
|
||||
RdcRasLib::RdcRasLib(const char* lib_name):
|
||||
fields_value_get_(nullptr)
|
||||
, fields_query_(nullptr)
|
||||
, fields_watch_(nullptr)
|
||||
, fields_unwatch_(nullptr)
|
||||
, rdc_module_init_(nullptr)
|
||||
, rdc_module_destroy_(nullptr) {
|
||||
rdc_status_t status = lib_loader_.load(lib_name);
|
||||
@@ -67,6 +69,16 @@ RdcRasLib::RdcRasLib(const char* lib_name):
|
||||
if (status != RDC_ST_OK) {
|
||||
fields_query_ = nullptr;
|
||||
}
|
||||
status = lib_loader_.load_symbol(&fields_watch_,
|
||||
"rdc_telemetry_fields_watch");
|
||||
if (status != RDC_ST_OK) {
|
||||
fields_watch_ = nullptr;
|
||||
}
|
||||
status = lib_loader_.load_symbol(&fields_unwatch_,
|
||||
"rdc_telemetry_fields_unwatch");
|
||||
if (status != RDC_ST_OK) {
|
||||
fields_unwatch_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
RdcRasLib::~RdcRasLib() {
|
||||
@@ -85,8 +97,9 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_query(
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
auto status = fields_query_(field_ids, field_count);
|
||||
RDC_LOG(RDC_DEBUG, "RAS support " << *field_count << " fields");
|
||||
return fields_query_(field_ids, field_count);
|
||||
return status;
|
||||
}
|
||||
|
||||
rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get(
|
||||
@@ -107,24 +120,30 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get(
|
||||
|
||||
rdc_status_t RdcRasLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) {
|
||||
// TODO(bill_liu): Not Support yet
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
(void)fields;
|
||||
(void)fields_count;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
if (!fields_watch_) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
rdc_status_t status = fields_watch_(fields, fields_count);
|
||||
RDC_LOG(RDC_DEBUG, "Watch " << fields_count << " fields from RAS: "
|
||||
<< rdc_status_string(status));
|
||||
return status;
|
||||
}
|
||||
|
||||
rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) {
|
||||
// TODO(bill_liu): Not Support yet
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
(void)fields;
|
||||
(void)fields_count;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
if (!fields_unwatch_) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
rdc_status_t status = fields_unwatch_(fields, fields_count);
|
||||
RDC_LOG(RDC_DEBUG, "Unwatch " << fields_count << " fields from RAS: "
|
||||
<< rdc_status_string(status));
|
||||
return status;
|
||||
}
|
||||
|
||||
} // namespace rdc
|
||||
|
||||
@@ -74,24 +74,34 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
|
||||
|
||||
rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) {
|
||||
// TODO(bill_liu): Not Support yet
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
(void)fields;
|
||||
(void)fields_count;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
|
||||
for (uint32_t i = 0; i < fields_count; i++) {
|
||||
metric_fetcher_->acquire_rsmi_handle(
|
||||
{fields[i].gpu_index, fields[i].field_id});
|
||||
}
|
||||
RDC_LOG(RDC_DEBUG, "acquire " << fields_count
|
||||
<< " field handles from rocm_smi_lib");
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) {
|
||||
// TODO(bill_liu): Not Support yet
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
(void)fields;
|
||||
(void)fields_count;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
|
||||
for (uint32_t i = 0; i < fields_count; i++) {
|
||||
metric_fetcher_->delete_rsmi_handle(
|
||||
{fields[i].gpu_index, fields[i].field_id});
|
||||
}
|
||||
RDC_LOG(RDC_DEBUG, "delete " << fields_count
|
||||
<< " field handles from rocm_smi_lib");
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(
|
||||
|
||||
@@ -35,10 +35,10 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_query(
|
||||
}
|
||||
auto ite = telemetry_modules_.begin();
|
||||
*field_count = 0;
|
||||
uint32_t count = 0;
|
||||
for (; ite != telemetry_modules_.end(); ite++) {
|
||||
uint32_t count = 0;
|
||||
rdc_status_t status = (*ite)->rdc_telemetry_fields_query(
|
||||
&(field_ids[count]), &count);
|
||||
&(field_ids[*field_count]), &count);
|
||||
if (status == RDC_ST_OK) {
|
||||
*field_count += count;
|
||||
}
|
||||
@@ -52,11 +52,20 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_watch(
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
auto ite = telemetry_modules_.begin();
|
||||
for (; ite != telemetry_modules_.end(); ite++) {
|
||||
(*ite)->rdc_telemetry_fields_watch(
|
||||
fields, fields_count);
|
||||
|
||||
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> fields_in_module;
|
||||
std::vector<rdc_gpu_field_value_t> unsupport_fields;
|
||||
get_fields_for_module(fields, fields_count,
|
||||
fields_in_module, unsupport_fields);
|
||||
|
||||
auto ite = fields_in_module.begin();
|
||||
for (; ite != fields_in_module.end(); ite++) {
|
||||
if (ite->second.size() > 0) {
|
||||
ite->first->rdc_telemetry_fields_watch(
|
||||
&ite->second[0], ite->second.size());
|
||||
}
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
@@ -66,11 +75,20 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
auto ite = telemetry_modules_.begin();
|
||||
for (; ite != telemetry_modules_.end(); ite++) {
|
||||
(*ite)->rdc_telemetry_fields_unwatch(
|
||||
fields, fields_count);
|
||||
|
||||
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> fields_in_module;
|
||||
std::vector<rdc_gpu_field_value_t> unsupport_fields;
|
||||
get_fields_for_module(fields, fields_count,
|
||||
fields_in_module, unsupport_fields);
|
||||
|
||||
auto ite = fields_in_module.begin();
|
||||
for (; ite != fields_in_module.end(); ite++) {
|
||||
if (ite->second.size() > 0) {
|
||||
ite->first->rdc_telemetry_fields_unwatch(
|
||||
&ite->second[0], ite->second.size());
|
||||
}
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
@@ -86,14 +104,40 @@ RdcTelemetryModule::RdcTelemetryModule(
|
||||
auto ite = telemetry_modules_.begin();
|
||||
for (; ite != telemetry_modules_.end(); ite++) {
|
||||
uint32_t field_ids[MAX_NUM_FIELDS];
|
||||
uint32_t field_count;
|
||||
(*ite)->rdc_telemetry_fields_query(field_ids, &field_count);
|
||||
for (uint32_t index = 0; index < field_count; index++) {
|
||||
fields_id_module_.insert({field_ids[index], (*ite)});
|
||||
uint32_t field_count = 0;
|
||||
rdc_status_t status = (*ite)->
|
||||
rdc_telemetry_fields_query(field_ids, &field_count);
|
||||
if (status == RDC_ST_OK) {
|
||||
for (uint32_t index = 0; index < field_count; index++) {
|
||||
fields_id_module_.insert({field_ids[index], (*ite)});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RdcTelemetryModule::get_fields_for_module(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>>&
|
||||
fields_in_module,
|
||||
std::vector<rdc_gpu_field_value_t>& unsupport_fields) {
|
||||
for (uint32_t findex = 0; findex < fields_count; findex++) {
|
||||
RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id];
|
||||
if (module) {
|
||||
fields_in_module[module].push_back(fields[findex]);
|
||||
} else {
|
||||
RDC_LOG(RDC_DEBUG, "Unsupported field " <<
|
||||
field_id_string(fields[findex].field_id));
|
||||
rdc_gpu_field_value_t value;
|
||||
value.gpu_index = fields[findex].gpu_index;
|
||||
value.field_value.field_id = fields[findex].field_id;
|
||||
value.field_value.status = RDC_ST_NOT_SUPPORTED;
|
||||
unsupport_fields.push_back(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(
|
||||
rdc_gpu_field_t* fields, uint32_t fields_count,
|
||||
rdc_field_value_f callback, void* user_data) {
|
||||
@@ -104,20 +148,8 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(
|
||||
// Dispatch the fields to the libraries
|
||||
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> fields_to_fetch;
|
||||
std::vector<rdc_gpu_field_value_t> unsupport_fields;
|
||||
for (uint32_t findex = 0; findex < fields_count; findex++) {
|
||||
RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id];
|
||||
if (module) {
|
||||
fields_to_fetch[module].push_back(fields[findex]);
|
||||
} else {
|
||||
RDC_LOG(RDC_DEBUG, "Unsupported field " <<
|
||||
field_id_string(fields[findex].field_id));
|
||||
rdc_gpu_field_value_t value;
|
||||
value.gpu_index = fields[findex].gpu_index;
|
||||
value.field_value.field_id = fields[findex].field_id;
|
||||
value.field_value.status = RDC_ST_NOT_SUPPORTED;
|
||||
unsupport_fields.push_back(value);
|
||||
}
|
||||
}
|
||||
get_fields_for_module(fields, fields_count,
|
||||
fields_to_fetch, unsupport_fields);
|
||||
|
||||
auto ite = fields_to_fetch.begin();
|
||||
for (; ite != fields_to_fetch.end(); ite ++) {
|
||||
|
||||
@@ -38,11 +38,9 @@ namespace rdc {
|
||||
|
||||
RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
|
||||
const RdcCacheManagerPtr& cache_mgr,
|
||||
const RdcMetricFetcherPtr& metric_fetcher,
|
||||
const RdcModuleMgrPtr& module_mgr):
|
||||
group_settings_(group_settings)
|
||||
, cache_mgr_(cache_mgr)
|
||||
, metric_fetcher_(metric_fetcher)
|
||||
, rdc_module_mgr_(module_mgr)
|
||||
, last_cleanup_time_(0) {
|
||||
}
|
||||
@@ -211,15 +209,44 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
|
||||
return result;
|
||||
}
|
||||
|
||||
// Skip not supported fields
|
||||
uint32_t unsupported_fields = 0;
|
||||
auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module();
|
||||
if (rdc_telemetry) {
|
||||
uint32_t field_ids[MAX_NUM_FIELDS];
|
||||
uint32_t field_count;
|
||||
result = rdc_telemetry->
|
||||
rdc_telemetry_fields_query(field_ids, &field_count);
|
||||
if (result == RDC_ST_OK) {
|
||||
RDC_LOG(RDC_DEBUG, "The system support "
|
||||
<< field_count << " fields");
|
||||
for (auto it = fields_in_watch.begin();
|
||||
it != fields_in_watch.end(); ) {
|
||||
bool not_supported = true;
|
||||
for (uint32_t fi = 0; fi < field_count; fi++) {
|
||||
if (field_ids[fi] == it->second) {
|
||||
not_supported = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (not_supported) {
|
||||
it = fields_in_watch.erase(it);
|
||||
unsupported_fields++;
|
||||
} else {
|
||||
it++;
|
||||
}
|
||||
} // end for
|
||||
} // end if
|
||||
}
|
||||
if ( unsupported_fields >0 ) {
|
||||
RDC_LOG(RDC_DEBUG, "Skip watch " << unsupported_fields
|
||||
<<" fields as they are not supported.");
|
||||
}
|
||||
|
||||
// Update the fields_to_watch_
|
||||
auto f_in_watch_iter = fields_in_watch.begin();
|
||||
|
||||
for (; f_in_watch_iter != fields_in_watch.end(); f_in_watch_iter++) {
|
||||
// Skip not support fields
|
||||
result = metric_fetcher_->acquire_rsmi_handle(*f_in_watch_iter);
|
||||
if (result != RDC_ST_OK) {
|
||||
continue;
|
||||
}
|
||||
auto ite = fields_to_watch_.find(*f_in_watch_iter);
|
||||
if (ite == fields_to_watch_.end()) { // A new field
|
||||
fields_to_watch_.insert({*f_in_watch_iter, f});
|
||||
@@ -242,6 +269,21 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
|
||||
// Add to the watch table
|
||||
watch_table_.insert({gkey, f});
|
||||
|
||||
// Notify the telemetry_module to watch those fields
|
||||
if (rdc_telemetry) {
|
||||
std::vector<rdc_gpu_field_t> fields;
|
||||
auto fields_to_watch_iter = fields_to_watch_.begin();
|
||||
for (; fields_to_watch_iter != fields_to_watch_.end();
|
||||
fields_to_watch_iter++) {
|
||||
if (fields_to_watch_iter->second.is_watching) {
|
||||
fields.push_back({fields_to_watch_iter->first.first,
|
||||
fields_to_watch_iter->first.second});
|
||||
}
|
||||
}
|
||||
rdc_telemetry->rdc_telemetry_fields_watch(&fields[0],
|
||||
fields.size());
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
@@ -293,27 +335,28 @@ rdc_status_t RdcWatchTableImpl::update_field_in_table_when_unwatch(
|
||||
|
||||
// Update the fields that impacted by this unwatch
|
||||
auto fite = fields.begin();
|
||||
std::vector<rdc_gpu_field_t> unwatch_fields;
|
||||
for (; fite != fields.end(); fite++) {
|
||||
auto f_in_table = fields_to_watch_.find((*fite));
|
||||
if (f_in_table == fields_to_watch_.end()) { // Not in fields_to_watch_
|
||||
unwatch_fields.push_back({fite->first, fite->second});
|
||||
continue;
|
||||
}
|
||||
|
||||
auto freq_iter = update_frequencies.find(*fite);
|
||||
if (freq_iter == update_frequencies.end()) {
|
||||
f_in_table->second.is_watching = false;
|
||||
unwatch_fields.push_back({fite->first, fite->second});
|
||||
} else {
|
||||
f_in_table->second.update_freq = freq_iter->second;
|
||||
}
|
||||
}
|
||||
|
||||
fite = fields.begin();
|
||||
for (; fite != fields.end(); fite++) {
|
||||
result = metric_fetcher_->delete_rsmi_handle(*fite);
|
||||
|
||||
if (result != RDC_ST_OK && result != RDC_ST_NOT_SUPPORTED) {
|
||||
return result;
|
||||
}
|
||||
// Notify the telemetry_module to unwatch those fields
|
||||
auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module();
|
||||
if (rdc_telemetry) {
|
||||
rdc_telemetry->rdc_telemetry_fields_unwatch(&unwatch_fields[0],
|
||||
unwatch_fields.size());
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur