Support watch() and unwatch() in RDC module framework

The framework now supports watch() and unwatch(), which can be used
by the telemetry library to init events or pre-fetch fields when recording
starts.
* A new header file RdcTelemetryLibInterface.h is defined for library to
  include it.
* The RdcWatchTable will not talk to RdcMetricFetcher directly anymore.
  It will call the framework watch/unwatch to dispatch it to the libraries.
* Make the python binding consistent with the current code.

Change-Id: Ie5731d920ed5928f901369d60c23bd450807a562


[ROCm/rdc commit: 151520b97e]
Cette révision appartient à :
Bill(Shuzhou) Liu
2020-09-15 14:53:04 -04:00
Parent fbd87a499f
révision 753d5fed6d
13 fichiers modifiés avec 266 ajouts et 80 suppressions
+4
Voir le fichier
@@ -27,7 +27,11 @@ THE SOFTWARE.
extern "C" {
#endif // __cplusplus
#ifdef __cplusplus
#include <cstdint>
#else
#include <stdint.h>
#endif
/** \file rdc.h
* Main header file for the ROCm RDC library.
+1 -16
Voir le fichier
@@ -24,26 +24,11 @@ THE SOFTWARE.
#include <memory>
#include "rdc/rdc.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
namespace amd {
namespace rdc {
// Structure to keep both gup index and field value
typedef struct {
uint32_t gpu_index;
rdc_field_value field_value;
} rdc_gpu_field_value_t;
typedef struct {
uint32_t gpu_index;
rdc_field_t field_id;
} rdc_gpu_field_t;
#define MAX_NUM_FIELDS 8192
typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values,
uint32_t num_values, void* user_data);
class RdcTelemetry {
public:
// get support field ids
+68
Voir le fichier
@@ -0,0 +1,68 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
#define RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
// The telemetry interface for libraries, for example, RAS.
#include <rdc/rdc.h>
#include <cstdint>
extern "C" {
// Structure to keep both gup index and field value
typedef struct {
uint32_t gpu_index;
rdc_field_value field_value;
} rdc_gpu_field_value_t;
typedef struct {
uint32_t gpu_index;
rdc_field_t field_id;
} rdc_gpu_field_t;
#define MAX_NUM_FIELDS 8192
typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values,
uint32_t num_values, void* user_data);
// The library will implement below function
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count);
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count, rdc_field_value_f callback,
void* user_data);
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t rdc_module_init(uint64_t flags);
rdc_status_t rdc_module_destroy();
}
#endif // RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
+2 -1
Voir le fichier
@@ -24,6 +24,7 @@ THE SOFTWARE.
#include <mutex> // NOLINT(build/c++11)
#include <future> // NOLINT(build/c++11)
#include <memory>
#include <condition_variable> // NOLINT(build/c++11)
#include <map>
#include <queue>
@@ -54,7 +55,7 @@ struct FieldRSMIData {
union {
rsmi_counter_value_t counter_val;
};
~FieldRSMIData(){}
~FieldRSMIData() {}
FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0}{}
};
+3
Voir le fichier
@@ -61,6 +61,9 @@ class RdcRasLib: public RdcTelemetry {
uint32_t, rdc_field_value_f, void*);
rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*);
rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t);
rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t);
rdc_status_t (*rdc_module_init_)(uint64_t);
rdc_status_t (*rdc_module_destroy_)();
};
+9
Voir le fichier
@@ -24,6 +24,7 @@ THE SOFTWARE.
#include <map>
#include <list>
#include <vector>
#include <memory>
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/impl/RdcRasLib.h"
@@ -49,7 +50,15 @@ class RdcTelemetryModule : public RdcTelemetry {
RdcTelemetryModule(const RdcMetricFetcherPtr& fetcher,
const RdcRasLibPtr& ras_module);
private:
//< Helper function to dispatch fields to module
void get_fields_for_module(
rdc_gpu_field_t* fields,
uint32_t fields_count,
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>>
& fields_in_module,
std::vector<rdc_gpu_field_value_t>& unsupport_fields); // NOLINT
std::list<RdcTelemetryPtr> telemetry_modules_;
std::map<uint32_t, RdcTelemetryPtr> fields_id_module_;
};
-3
Voir le fichier
@@ -83,10 +83,8 @@ class RdcWatchTableImpl : public RdcWatchTable {
//!< once per second.
rdc_status_t rdc_field_update_all() override;
// TODO(bill_liu): Remove the RdcMetricFetcherPtr
RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
const RdcCacheManagerPtr& cache_mgr,
const RdcMetricFetcherPtr& metric_fetcher,
const RdcModuleMgrPtr& module_mgr);
private:
@@ -116,7 +114,6 @@ class RdcWatchTableImpl : public RdcWatchTable {
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcMetricFetcherPtr metric_fetcher_;
RdcModuleMgrPtr rdc_module_mgr_;
//!< The watch table to store the watch settings.
+15
Voir le fichier
@@ -31,6 +31,11 @@ class rdc_status_t(Enum):
RDC_ST_CLIENT_ERROR = 8
RDC_ST_ALREADY_EXIST = 9
RDC_ST_MAX_LIMIT = 10
RDC_ST_INSUFF_RESOURCES = 11
RDC_ST_FILE_ERROR = 12
RDC_ST_NO_DATA = 13
RDC_ST_PERM_ERROR = 14
RDC_ST_UNKNOWN_ERROR = 4294967295
class rdc_operation_mode_t(c_int):
RDC_OPERATION_MODE_AUTO = 0
@@ -62,6 +67,16 @@ class rdc_field_t(c_int):
RDC_FI_GPU_MEMORY_TOTAL = 502
RDC_FI_ECC_CORRECT_TOTAL = 600
RDC_FI_ECC_UNCORRECT_TOTAL = 601
RDC_EVNT_XGMI_0_NOP_TX = 1000
RDC_EVNT_XGMI_0_REQ_TX = 1001
RDC_EVNT_XGMI_0_RESP_TX = 1002
RDC_EVNT_XGMI_0_BEATS_TX = 1003
RDC_EVNT_XGMI_1_NOP_TX = 1004
RDC_EVNT_XGMI_1_REQ_TX = 1005
RDC_EVNT_XGMI_1_RESP_TX = 1006
RDC_EVNT_XGMI_1_BEATS_TX = 1007
RDC_EVNT_XGMI_0_THRPUT = 1500
RDC_EVNT_XGMI_1_THRPUT = 1501
rdc_handle_t = c_void_p
rdc_gpu_group_t = c_uint32
+1 -1
Voir le fichier
@@ -73,7 +73,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode):
, metric_fetcher_(new RdcMetricFetcherImpl())
, rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_))
, watch_table_(new RdcWatchTableImpl(group_settings_,
cache_mgr_, metric_fetcher_, rdc_module_mgr_))
cache_mgr_, rdc_module_mgr_))
, metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_,
METIC_UPDATE_FREQUENCY)) {
if (mode == RDC_OPERATION_MODE_AUTO) {
+28 -9
Voir le fichier
@@ -30,6 +30,8 @@ namespace rdc {
RdcRasLib::RdcRasLib(const char* lib_name):
fields_value_get_(nullptr)
, fields_query_(nullptr)
, fields_watch_(nullptr)
, fields_unwatch_(nullptr)
, rdc_module_init_(nullptr)
, rdc_module_destroy_(nullptr) {
rdc_status_t status = lib_loader_.load(lib_name);
@@ -67,6 +69,16 @@ RdcRasLib::RdcRasLib(const char* lib_name):
if (status != RDC_ST_OK) {
fields_query_ = nullptr;
}
status = lib_loader_.load_symbol(&fields_watch_,
"rdc_telemetry_fields_watch");
if (status != RDC_ST_OK) {
fields_watch_ = nullptr;
}
status = lib_loader_.load_symbol(&fields_unwatch_,
"rdc_telemetry_fields_unwatch");
if (status != RDC_ST_OK) {
fields_unwatch_ = nullptr;
}
}
RdcRasLib::~RdcRasLib() {
@@ -85,8 +97,9 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_query(
return RDC_ST_FAIL_LOAD_MODULE;
}
auto status = fields_query_(field_ids, field_count);
RDC_LOG(RDC_DEBUG, "RAS support " << *field_count << " fields");
return fields_query_(field_ids, field_count);
return status;
}
rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get(
@@ -107,24 +120,30 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get(
rdc_status_t RdcRasLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) {
// TODO(bill_liu): Not Support yet
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
(void)fields;
(void)fields_count;
return RDC_ST_NOT_SUPPORTED;
if (!fields_watch_) {
return RDC_ST_FAIL_LOAD_MODULE;
}
rdc_status_t status = fields_watch_(fields, fields_count);
RDC_LOG(RDC_DEBUG, "Watch " << fields_count << " fields from RAS: "
<< rdc_status_string(status));
return status;
}
rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) {
// TODO(bill_liu): Not Support yet
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
(void)fields;
(void)fields_count;
return RDC_ST_NOT_SUPPORTED;
if (!fields_unwatch_) {
return RDC_ST_FAIL_LOAD_MODULE;
}
rdc_status_t status = fields_unwatch_(fields, fields_count);
RDC_LOG(RDC_DEBUG, "Unwatch " << fields_count << " fields from RAS: "
<< rdc_status_string(status));
return status;
}
} // namespace rdc
+18 -8
Voir le fichier
@@ -74,24 +74,34 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) {
// TODO(bill_liu): Not Support yet
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
(void)fields;
(void)fields_count;
return RDC_ST_NOT_SUPPORTED;
for (uint32_t i = 0; i < fields_count; i++) {
metric_fetcher_->acquire_rsmi_handle(
{fields[i].gpu_index, fields[i].field_id});
}
RDC_LOG(RDC_DEBUG, "acquire " << fields_count
<< " field handles from rocm_smi_lib");
return RDC_ST_OK;
}
rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) {
// TODO(bill_liu): Not Support yet
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
(void)fields;
(void)fields_count;
return RDC_ST_NOT_SUPPORTED;
for (uint32_t i = 0; i < fields_count; i++) {
metric_fetcher_->delete_rsmi_handle(
{fields[i].gpu_index, fields[i].field_id});
}
RDC_LOG(RDC_DEBUG, "delete " << fields_count
<< " field handles from rocm_smi_lib");
return RDC_ST_OK;
}
rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(
+60 -28
Voir le fichier
@@ -35,10 +35,10 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_query(
}
auto ite = telemetry_modules_.begin();
*field_count = 0;
uint32_t count = 0;
for (; ite != telemetry_modules_.end(); ite++) {
uint32_t count = 0;
rdc_status_t status = (*ite)->rdc_telemetry_fields_query(
&(field_ids[count]), &count);
&(field_ids[*field_count]), &count);
if (status == RDC_ST_OK) {
*field_count += count;
}
@@ -52,11 +52,20 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_watch(
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
auto ite = telemetry_modules_.begin();
for (; ite != telemetry_modules_.end(); ite++) {
(*ite)->rdc_telemetry_fields_watch(
fields, fields_count);
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> fields_in_module;
std::vector<rdc_gpu_field_value_t> unsupport_fields;
get_fields_for_module(fields, fields_count,
fields_in_module, unsupport_fields);
auto ite = fields_in_module.begin();
for (; ite != fields_in_module.end(); ite++) {
if (ite->second.size() > 0) {
ite->first->rdc_telemetry_fields_watch(
&ite->second[0], ite->second.size());
}
}
return RDC_ST_OK;
}
@@ -66,11 +75,20 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
auto ite = telemetry_modules_.begin();
for (; ite != telemetry_modules_.end(); ite++) {
(*ite)->rdc_telemetry_fields_unwatch(
fields, fields_count);
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> fields_in_module;
std::vector<rdc_gpu_field_value_t> unsupport_fields;
get_fields_for_module(fields, fields_count,
fields_in_module, unsupport_fields);
auto ite = fields_in_module.begin();
for (; ite != fields_in_module.end(); ite++) {
if (ite->second.size() > 0) {
ite->first->rdc_telemetry_fields_unwatch(
&ite->second[0], ite->second.size());
}
}
return RDC_ST_OK;
}
@@ -86,14 +104,40 @@ RdcTelemetryModule::RdcTelemetryModule(
auto ite = telemetry_modules_.begin();
for (; ite != telemetry_modules_.end(); ite++) {
uint32_t field_ids[MAX_NUM_FIELDS];
uint32_t field_count;
(*ite)->rdc_telemetry_fields_query(field_ids, &field_count);
for (uint32_t index = 0; index < field_count; index++) {
fields_id_module_.insert({field_ids[index], (*ite)});
uint32_t field_count = 0;
rdc_status_t status = (*ite)->
rdc_telemetry_fields_query(field_ids, &field_count);
if (status == RDC_ST_OK) {
for (uint32_t index = 0; index < field_count; index++) {
fields_id_module_.insert({field_ids[index], (*ite)});
}
}
}
}
void RdcTelemetryModule::get_fields_for_module(
rdc_gpu_field_t* fields,
uint32_t fields_count,
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>>&
fields_in_module,
std::vector<rdc_gpu_field_value_t>& unsupport_fields) {
for (uint32_t findex = 0; findex < fields_count; findex++) {
RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id];
if (module) {
fields_in_module[module].push_back(fields[findex]);
} else {
RDC_LOG(RDC_DEBUG, "Unsupported field " <<
field_id_string(fields[findex].field_id));
rdc_gpu_field_value_t value;
value.gpu_index = fields[findex].gpu_index;
value.field_value.field_id = fields[findex].field_id;
value.field_value.status = RDC_ST_NOT_SUPPORTED;
unsupport_fields.push_back(value);
}
}
}
rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(
rdc_gpu_field_t* fields, uint32_t fields_count,
rdc_field_value_f callback, void* user_data) {
@@ -104,20 +148,8 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(
// Dispatch the fields to the libraries
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> fields_to_fetch;
std::vector<rdc_gpu_field_value_t> unsupport_fields;
for (uint32_t findex = 0; findex < fields_count; findex++) {
RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id];
if (module) {
fields_to_fetch[module].push_back(fields[findex]);
} else {
RDC_LOG(RDC_DEBUG, "Unsupported field " <<
field_id_string(fields[findex].field_id));
rdc_gpu_field_value_t value;
value.gpu_index = fields[findex].gpu_index;
value.field_value.field_id = fields[findex].field_id;
value.field_value.status = RDC_ST_NOT_SUPPORTED;
unsupport_fields.push_back(value);
}
}
get_fields_for_module(fields, fields_count,
fields_to_fetch, unsupport_fields);
auto ite = fields_to_fetch.begin();
for (; ite != fields_to_fetch.end(); ite ++) {
+57 -14
Voir le fichier
@@ -38,11 +38,9 @@ namespace rdc {
RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
const RdcCacheManagerPtr& cache_mgr,
const RdcMetricFetcherPtr& metric_fetcher,
const RdcModuleMgrPtr& module_mgr):
group_settings_(group_settings)
, cache_mgr_(cache_mgr)
, metric_fetcher_(metric_fetcher)
, rdc_module_mgr_(module_mgr)
, last_cleanup_time_(0) {
}
@@ -211,15 +209,44 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
return result;
}
// Skip not supported fields
uint32_t unsupported_fields = 0;
auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module();
if (rdc_telemetry) {
uint32_t field_ids[MAX_NUM_FIELDS];
uint32_t field_count;
result = rdc_telemetry->
rdc_telemetry_fields_query(field_ids, &field_count);
if (result == RDC_ST_OK) {
RDC_LOG(RDC_DEBUG, "The system support "
<< field_count << " fields");
for (auto it = fields_in_watch.begin();
it != fields_in_watch.end(); ) {
bool not_supported = true;
for (uint32_t fi = 0; fi < field_count; fi++) {
if (field_ids[fi] == it->second) {
not_supported = false;
break;
}
}
if (not_supported) {
it = fields_in_watch.erase(it);
unsupported_fields++;
} else {
it++;
}
} // end for
} // end if
}
if ( unsupported_fields >0 ) {
RDC_LOG(RDC_DEBUG, "Skip watch " << unsupported_fields
<<" fields as they are not supported.");
}
// Update the fields_to_watch_
auto f_in_watch_iter = fields_in_watch.begin();
for (; f_in_watch_iter != fields_in_watch.end(); f_in_watch_iter++) {
// Skip not support fields
result = metric_fetcher_->acquire_rsmi_handle(*f_in_watch_iter);
if (result != RDC_ST_OK) {
continue;
}
auto ite = fields_to_watch_.find(*f_in_watch_iter);
if (ite == fields_to_watch_.end()) { // A new field
fields_to_watch_.insert({*f_in_watch_iter, f});
@@ -242,6 +269,21 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
// Add to the watch table
watch_table_.insert({gkey, f});
// Notify the telemetry_module to watch those fields
if (rdc_telemetry) {
std::vector<rdc_gpu_field_t> fields;
auto fields_to_watch_iter = fields_to_watch_.begin();
for (; fields_to_watch_iter != fields_to_watch_.end();
fields_to_watch_iter++) {
if (fields_to_watch_iter->second.is_watching) {
fields.push_back({fields_to_watch_iter->first.first,
fields_to_watch_iter->first.second});
}
}
rdc_telemetry->rdc_telemetry_fields_watch(&fields[0],
fields.size());
}
return RDC_ST_OK;
}
@@ -293,27 +335,28 @@ rdc_status_t RdcWatchTableImpl::update_field_in_table_when_unwatch(
// Update the fields that impacted by this unwatch
auto fite = fields.begin();
std::vector<rdc_gpu_field_t> unwatch_fields;
for (; fite != fields.end(); fite++) {
auto f_in_table = fields_to_watch_.find((*fite));
if (f_in_table == fields_to_watch_.end()) { // Not in fields_to_watch_
unwatch_fields.push_back({fite->first, fite->second});
continue;
}
auto freq_iter = update_frequencies.find(*fite);
if (freq_iter == update_frequencies.end()) {
f_in_table->second.is_watching = false;
unwatch_fields.push_back({fite->first, fite->second});
} else {
f_in_table->second.update_freq = freq_iter->second;
}
}
fite = fields.begin();
for (; fite != fields.end(); fite++) {
result = metric_fetcher_->delete_rsmi_handle(*fite);
if (result != RDC_ST_OK && result != RDC_ST_NOT_SUPPORTED) {
return result;
}
// Notify the telemetry_module to unwatch those fields
auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module();
if (rdc_telemetry) {
rdc_telemetry->rdc_telemetry_fields_unwatch(&unwatch_fields[0],
unwatch_fields.size());
}
return RDC_ST_OK;