9a2806ac95
The RAS plugin loaded rocm-smi, which conflicts with the amd-smi library.
The main source of grief was the map 'devInfoTypesStrings', which is defined
in both rocm-smi and amd-smi.
We assume that rocm-smi gets lazy-loaded by the RAS library and
overwrites symbols defined in amd-smi. devInfoTypesStrings in rocm-smi
contains a different number of elements, and the enums are also different.
RDC relies on amd-smi's enums.
One such enum is kDevGpuMetrics:
rocm-smi: kDevGpuMetrics = 68
amd-smi: kDevGpuMetrics = 75
Example of overlapping map definitions:
$ objdump --dynamic-syms /opt/rocm/lib/libamd_smi.so | grep devInfoTypesStrings
00000000003c4980 g DO .data.rel.ro0000000000000008 Base devInfoTypesStrings
00000000003db830 g DO .bss0000000000000030 Base _ZN3amd3smi6Device19devInfoTypesStringsE
$ objdump --dynamic-syms /opt/rocm/lib/librocm_smi64.so | grep devInfoTypesStrings
00000000003dc590 g DO .bss0000000000000030 Base _ZN3amd3smi6Device19devInfoTypesStringsE
00000000003c9c68 g DO .data.rel.ro0000000000000008 Base devInfoTypesStrings
Change-Id: Ib2f2db32b6abd7ebe84e7807c25581461eb86bae
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: d85657e5f2]
108 lignes
3.9 KiB
C++
108 lignes
3.9 KiB
C++
/*
|
|
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
#ifndef INCLUDE_RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
|
|
#define INCLUDE_RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
|
|
|
|
#include <atomic>
#include <condition_variable> // NOLINT(build/c++11)
#include <cstdint>
#include <functional>
#include <future> // NOLINT(build/c++11)
#include <map>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <queue>
#include <vector>

#include "amd_smi/amdsmi.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/rdc_common.h"
|
|
|
|
namespace amd {
|
|
namespace rdc {
|
|
|
|
//!< Some metrics, like PCIe throughput may take a second to retreive. The
|
|
//!< MetricValue will cache those metrics for async retreive.
|
|
struct MetricValue {
|
|
uint64_t cache_ttl;
|
|
uint64_t last_time;
|
|
rdc_field_value value;
|
|
};
|
|
|
|
// This union represents any SMI handles require initialization and/or
|
|
// shut down. There should only be one instance of this for each raw event
|
|
// used. For example, if a field group includes a pseudo-event and the
|
|
// underlying raw event, then only one FieldSMIData should be created,
|
|
// and it should be used by both events.
|
|
struct FieldSMIData {
|
|
union {
|
|
amdsmi_event_handle_t evt_handle;
|
|
};
|
|
union {
|
|
amdsmi_counter_value_t counter_val;
|
|
};
|
|
~FieldSMIData() {}
|
|
FieldSMIData() : evt_handle(0), counter_val{0, 0, 0} {}
|
|
};
|
|
|
|
//!< The data structure to store the async fetch task
|
|
class RdcMetricFetcherImpl;
|
|
struct MetricTask {
|
|
RdcFieldKey field;
|
|
std::function<void(RdcMetricFetcherImpl&, RdcFieldKey)> task;
|
|
};
|
|
|
|
class RdcMetricFetcherImpl final : public RdcMetricFetcher {
|
|
public:
|
|
rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
|
|
rdc_field_value* value) override;
|
|
rdc_status_t bulk_fetch_smi_fields(
|
|
rdc_gpu_field_t* fields, uint32_t fields_count,
|
|
std::vector<rdc_gpu_field_value_t>& results) override; // NOLINT
|
|
RdcMetricFetcherImpl();
|
|
~RdcMetricFetcherImpl() final;
|
|
|
|
rdc_status_t acquire_smi_handle(RdcFieldKey fk) override;
|
|
rdc_status_t delete_smi_handle(RdcFieldKey fk) override;
|
|
|
|
private:
|
|
std::shared_ptr<FieldSMIData> get_smi_data(RdcFieldKey key);
|
|
|
|
uint64_t now();
|
|
void get_ecc(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
|
void get_ecc_total(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
|
|
|
//!< return true if starting async_get
|
|
bool async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
|
void get_pcie_throughput(const RdcFieldKey& key);
|
|
|
|
//!< Async metric retreive
|
|
std::map<RdcFieldKey, MetricValue> async_metrics_;
|
|
std::map<RdcFieldKey, std::shared_ptr<FieldSMIData>> smi_data_;
|
|
std::queue<MetricTask> updated_tasks_;
|
|
std::mutex task_mutex_;
|
|
std::future<void> updater_; // keep the future of updater
|
|
std::condition_variable cv_;
|
|
std::atomic<bool> task_started_;
|
|
};
|
|
|
|
} // namespace rdc
|
|
} // namespace amd
|
|
|
|
#endif // INCLUDE_RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
|