diff --git a/common/rdc_field_data.data b/common/rdc_field_data.data index a6f40d34ff..6c389fea6d 100644 --- a/common/rdc_field_data.data +++ b/common/rdc_field_data.data @@ -29,21 +29,21 @@ THE SOFTWARE. // 4 bool do or do not display in rdci // rdc_field_t Description rdci label To Display // =========== =========== ========= ========== -FLD_DESC_ENT(RDC_FI_INVALID, "Unknown/Invalid field", "INVALID", true) +FLD_DESC_ENT(RDC_FI_INVALID, "Unknown/Invalid field", "INVALID", false) FLD_DESC_ENT(RDC_FI_GPU_COUNT, "GPU count in the system", "GPU_COUNT", true) FLD_DESC_ENT(RDC_FI_DEV_NAME, "Name of the device", "DEV_NAME", true) -FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies.", "GPU_CLOCK", true) -FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies.", "MEM_CLOCK", true) -FLD_DESC_ENT(RDC_FI_MEMORY_TEMP, "Memory temperature in millidegrees Celsius.", "MEMORY_TEMP", true) -FLD_DESC_ENT(RDC_FI_GPU_TEMP, "GPU temperature in millidegrees Celsius.", "GPU_TEMP", true) -FLD_DESC_ENT(RDC_FI_POWER_USAGE, "Power usage in microwatts.", "POWER_USAGE", true) -FLD_DESC_ENT(RDC_FI_PCIE_TX, "PCIe Tx utilization in bytes/second.", "PCIE_TX", true) -FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second.", "PCIE_RX", true) -FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage.", "GPU_UTIL", true) -FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes.", "GPU_MEMORY_USAGE", true) +FLD_DESC_ENT(RDC_FI_GPU_CLOCK, "Current GPU clock frequencies", "GPU_CLOCK", true) +FLD_DESC_ENT(RDC_FI_MEM_CLOCK, "Current Memory clock frequencies", "MEM_CLOCK", true) +FLD_DESC_ENT(RDC_FI_MEMORY_TEMP, "Memory temperature in millidegrees Celsius", "MEMORY_TEMP", true) +FLD_DESC_ENT(RDC_FI_GPU_TEMP, "GPU temperature in millidegrees Celsius", "GPU_TEMP", true) +FLD_DESC_ENT(RDC_FI_POWER_USAGE, "Power usage in microwatts", "POWER_USAGE", true) +FLD_DESC_ENT(RDC_FI_PCIE_TX, "PCIe Tx utilization in bytes/second", "PCIE_TX", true) 
+FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second", "PCIE_RX", true)
+FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage", "GPU_UTIL", true)
+FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes", "GPU_MEMORY_USAGE", true)
 FLD_DESC_ENT(RDC_FI_GPU_MEMORY_TOTAL, "Total memory of the GPU instance", "GPU_MEMORY_TOTAL", true)
-FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated correctable ECC errors.", "ECC_CORRECT", true)
-FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated uncorrectable ECC errors.", "ECC_UNCORRECT", true)
+FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated correctable ECC errors", "ECC_CORRECT", true)
+FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated uncorrectable ECC errors", "ECC_UNCORRECT", true)

 FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
 FLD_DESC_ENT(RDC_EVNT_XGMI_0_REQ_TX, "Outgoing requests to neighbor 0", "XGMI_REQ_0", false)
diff --git a/include/rdc_lib/RdcLibraryLoader.h b/include/rdc_lib/RdcLibraryLoader.h
index 2b3c1a4352..dcc6be5bd3 100644
--- a/include/rdc_lib/RdcLibraryLoader.h
+++ b/include/rdc_lib/RdcLibraryLoader.h
@@ -36,6 +36,11 @@ class RdcLibraryLoader {
  public:
   RdcLibraryLoader();

+  rdc_status_t load(const char* filename);
+
+  template<typename T> rdc_status_t load_symbol(T* func_handler,
+      const char* func_name);
+
   template<typename T> rdc_status_t load(const char* filename,
       T* func_make_handler);

@@ -48,35 +53,52 @@ class RdcLibraryLoader {
   std::mutex library_mutex_;
 };

+// Look up |func_name| in the library held by this loader and store the
+// address in |*func_handler|. Returns RDC_ST_FAIL_LOAD_MODULE when no library
+// is loaded, when an argument is null, or when dlsym() yields a null address.
+template<typename T> rdc_status_t RdcLibraryLoader::load_symbol(T* func_handler,
+    const char* func_name) {
+  // Take the lock before touching libHandler_ so the handle cannot change
+  // between the check below and the dlsym() call.
+  std::lock_guard<std::mutex> guard(library_mutex_);
+
+  if (!libHandler_) {
+    RDC_LOG(RDC_ERROR, "Must load the library before load the symbol");
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
+  if (!func_handler || !func_name) {
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
+  dlerror();  // Drop any stale error so the report below belongs to dlsym().
+  *reinterpret_cast<void**>(func_handler) =
+      dlsym(libHandler_, func_name);
+  if (*func_handler == nullptr) {
+    // dlerror() returns NULL when no error is pending; streaming a null
+    // char* into the log would be undefined behavior.
+    const char* error = dlerror();
+    RDC_LOG(RDC_ERROR, "RdcLibraryLoader: Fail to load the symbol "
+        << func_name << ": " << (error ? error : "symbol address is NULL"));
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
+  return RDC_ST_OK;
+}
+
+
 template<typename T> rdc_status_t RdcLibraryLoader::load(const char* filename,
       T* func_make_handler) {
   if (filename == nullptr || func_make_handler == nullptr) {
     return RDC_ST_FAIL_LOAD_MODULE;
   }

-  if (libHandler_) {
-    unload();
+  rdc_status_t status = load(filename);
+  if (status != RDC_ST_OK) {
+    return status;
   }

-  std::lock_guard<std::mutex> guard(library_mutex_);
-  libHandler_ = dlopen(filename, RTLD_LAZY);
-  if (!libHandler_) {
-    char* error = dlerror();
-    RDC_LOG(RDC_ERROR, "Fail to open " << filename <<": " << error);
-    return RDC_ST_FAIL_LOAD_MODULE;
-  }
-
-  *reinterpret_cast<void**>(func_make_handler) =
-      dlsym(libHandler_, "make_handler");
-  if (*func_make_handler == nullptr) {
-    char* error = dlerror();
-    RDC_LOG(RDC_ERROR,
-        "Fail to find function make_handler from file "
-        << filename <<": " << error);
-    return RDC_ST_FAIL_LOAD_MODULE;
-  }
-
-  return RDC_ST_OK;
+  return load_symbol(func_make_handler, "make_handler");
 }

 }  // namespace rdc
diff --git a/include/rdc_lib/RdcModuleMgr.h b/include/rdc_lib/RdcModuleMgr.h
new file mode 100644
index 0000000000..bf5b23aa4f
--- /dev/null
+++ b/include/rdc_lib/RdcModuleMgr.h
@@ -0,0 +1,44 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDC_LIB_RDCMODULEMGR_H_
+#define RDC_LIB_RDCMODULEMGR_H_
+
+#include <memory>
+#include "rdc_lib/rdc_common.h"
+#include "rdc/rdc.h"
+#include "rdc_lib/RdcTelemetry.h"
+
+namespace amd {
+namespace rdc {
+
+// Interface through which the rest of the library obtains its function
+// modules. Only the telemetry module is exposed at this point.
+class RdcModuleMgr {
+ public:
+  // Returns the telemetry module as an RdcTelemetryPtr.
+  virtual RdcTelemetryPtr get_telemetry_module() = 0;
+
+  // Instances are owned and destroyed through RdcModuleMgrPtr, i.e. via a
+  // pointer to this base class, so the destructor must be virtual.
+  virtual ~RdcModuleMgr() {}
+};
+
+typedef std::shared_ptr<RdcModuleMgr> RdcModuleMgrPtr;
+
+}  // namespace rdc
+}  // namespace amd
+
+
+#endif  // RDC_LIB_RDCMODULEMGR_H_
diff --git a/include/rdc_lib/RdcTelemetry.h b/include/rdc_lib/RdcTelemetry.h
new file mode 100644
index 0000000000..0370643079
--- /dev/null
+++ b/include/rdc_lib/RdcTelemetry.h
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDC_LIB_RDCTELEMETRY_H_
+#define RDC_LIB_RDCTELEMETRY_H_
+
+#include <memory>
+#include "rdc/rdc.h"
+
+namespace amd {
+namespace rdc {
+
+// Pairs a GPU index with one fetched field value.
+typedef struct {
+  uint32_t gpu_index;
+  rdc_field_value field_value;
+} rdc_gpu_field_value_t;
+
+
+// Identifies one field to fetch: a GPU index plus the field id.
+typedef struct {
+  uint32_t gpu_index;
+  rdc_field_t field_id;
+} rdc_gpu_field_t;
+
+// Capacity of the field_ids array passed to rdc_telemetry_fields_query().
+#define MAX_NUM_FIELDS 8192
+
+// Callback that receives |num_values| fetched values together with the
+// |user_data| pointer that was handed to rdc_telemetry_fields_value_get().
+typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values,
+      uint32_t num_values, void* user_data);
+
+// Interface implemented by every telemetry source.
+class RdcTelemetry {
+ public:
+  // Get the supported field ids: |field_ids| receives the ids and
+  // |field_count| how many were written.
+  virtual rdc_status_t rdc_telemetry_fields_query(
+      uint32_t field_ids[MAX_NUM_FIELDS],
+      uint32_t* field_count) = 0;
+
+  // Fetch the |fields_count| entries of |fields| and deliver the values
+  // through |callback|, which also receives |user_data|.
+  virtual rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
+      uint32_t fields_count, rdc_field_value_f callback,
+      void* user_data) = 0;
+
+  // Start / stop watching the given fields.
+  virtual rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) = 0;
+  virtual rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) = 0;
+
+  virtual ~RdcTelemetry() {}
+};
+typedef std::shared_ptr<RdcTelemetry> RdcTelemetryPtr;
+
+}  // namespace rdc
+}  // namespace amd
+
+
+#endif  // RDC_LIB_RDCTELEMETRY_H_
diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h
index 0541be031f..2ba6aaf808 100644
--- a/include/rdc_lib/impl/RdcEmbeddedHandler.h
+++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h
@@ -29,6 +29,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcCacheManager.h" #include "rdc_lib/RdcMetricsUpdater.h" #include "rdc_lib/RdcWatchTable.h" +#include "rdc_lib/RdcModuleMgr.h" namespace amd { namespace rdc { @@ -96,6 +97,7 @@ class RdcEmbeddedHandler: public RdcHandler { RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; RdcMetricFetcherPtr metric_fetcher_; + RdcModuleMgrPtr rdc_module_mgr_; RdcWatchTablePtr watch_table_; RdcMetricsUpdaterPtr metrics_updater_; std::future updater_; diff --git a/include/rdc_lib/impl/RdcModuleMgrImpl.h b/include/rdc_lib/impl/RdcModuleMgrImpl.h new file mode 100644 index 0000000000..6e0706fdd8 --- /dev/null +++ b/include/rdc_lib/impl/RdcModuleMgrImpl.h @@ -0,0 +1,53 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#ifndef RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_
+#define RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_
+
+#include <memory>
+#include "rdc_lib/RdcModuleMgr.h"
+#include "rdc_lib/RdcMetricFetcher.h"
+#include "rdc_lib/RdcTelemetry.h"
+#include "rdc_lib/impl/RdcRasLib.h"
+
+namespace amd {
+namespace rdc {
+
+// RdcModuleMgr implementation that keeps the metric fetcher it is given and
+// hands out a telemetry module on request.
+class RdcModuleMgrImpl: public RdcModuleMgr {
+ public:
+  explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher);
+
+  RdcTelemetryPtr get_telemetry_module() override;
+
+ private:
+  // Function module
+  RdcTelemetryPtr rdc_telemetry_module_;
+
+  // Domain module
+  RdcRasLibPtr ras_lib_;
+
+  RdcMetricFetcherPtr fetcher_;
+};
+
+}  // namespace rdc
+}  // namespace amd
+
+
+#endif  // RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_
diff --git a/include/rdc_lib/impl/RdcRasLib.h b/include/rdc_lib/impl/RdcRasLib.h
new file mode 100644
index 0000000000..e7e1ef703c
--- /dev/null
+++ b/include/rdc_lib/impl/RdcRasLib.h
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDC_LIB_IMPL_RDCRASLIB_H_
+#define RDC_LIB_IMPL_RDCRASLIB_H_
+
+// NOTE(review): the six system header names below were lost when this patch
+// was extracted; restore them from the original file before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+#include "rdc_lib/RdcLibraryLoader.h"
+#include "rdc_lib/RdcTelemetry.h"
+
+
+namespace amd {
+namespace rdc {
+// RdcTelemetry implementation backed by a shared library that is opened
+// through RdcLibraryLoader.
+class RdcRasLib: public RdcTelemetry {
+ public:
+  // get support field ids
+  rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
+      uint32_t* field_count) override;
+
+  // Fetch
+  rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
+      uint32_t fields_count, rdc_field_value_f callback,
+      void* user_data) override;
+
+  rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) override;
+
+  rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) override;
+
+  explicit RdcRasLib(const char* lib_name);
+
+ private:
+  RdcLibraryLoader lib_loader_;
+
+  // Entry points resolved from the loaded library. They default to null so a
+  // constructor path that skips an assignment can never leave them
+  // indeterminate; callers test them before use.
+  rdc_status_t (*fields_value_get_)(rdc_gpu_field_t*,
+      uint32_t, rdc_field_value_f, void*) = nullptr;
+  rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*) = nullptr;
+};
+typedef std::shared_ptr<RdcRasLib> RdcRasLibPtr;
+
+
+}  // namespace rdc
+}  // namespace amd
+
+
+#endif  // RDC_LIB_IMPL_RDCRASLIB_H_
diff --git a/include/rdc_lib/impl/RdcSmiLib.h b/include/rdc_lib/impl/RdcSmiLib.h
new file mode 100644
index 0000000000..88d7e4444c
--- /dev/null
+++ b/include/rdc_lib/impl/RdcSmiLib.h
@@ -0,0 +1,58 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#ifndef RDC_LIB_IMPL_RDCSMILIB_H_
+#define RDC_LIB_IMPL_RDCSMILIB_H_
+
+#include
+#include
+#include "rdc_lib/RdcMetricFetcher.h"
+#include "rdc_lib/RdcTelemetry.h"
+
+namespace amd {
+namespace rdc {
+
+// RdcTelemetry implementation that serves fields through the
+// RdcMetricFetcherPtr it is constructed with.
+class RdcSmiLib : public RdcTelemetry {
+ public:
+  // get support field ids
+  rdc_status_t rdc_telemetry_fields_query(
+      uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) override;
+
+  // Fetch
+  rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
+      uint32_t fields_count, rdc_field_value_f callback,
+      void* user_data) override;
+
+  // Watch / unwatch counterparts of the RdcTelemetry interface.
+  rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) override;
+  rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) override;
+
+  explicit RdcSmiLib(const RdcMetricFetcherPtr& mf);
+
+ private:
+  // Fetcher handed to the constructor.
+  RdcMetricFetcherPtr metric_fetcher_;
+};
+
+}  // namespace rdc
+}  // namespace amd
+
+#endif  // RDC_LIB_IMPL_RDCSMILIB_H_
diff --git a/include/rdc_lib/impl/RdcTelemetryModule.h b/include/rdc_lib/impl/RdcTelemetryModule.h
new file mode 100644
index 0000000000..3a67e5bda3
--- /dev/null
+++ b/include/rdc_lib/impl/RdcTelemetryModule.h
@@ -0,0 +1,63 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_
+#define RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_
+
+#include <list>
+#include <map>
+#include <memory>
+#include "rdc_lib/RdcTelemetry.h"
+#include "rdc_lib/impl/RdcRasLib.h"
+#include "rdc_lib/RdcMetricFetcher.h"
+
+namespace amd {
+namespace rdc {
+
+// RdcTelemetry implementation constructed from a metric fetcher and the RAS
+// library wrapper. The four interface functions are marked override so a
+// signature drift from RdcTelemetry becomes a compile error, matching
+// RdcRasLib and RdcSmiLib.
+class RdcTelemetryModule : public RdcTelemetry {
+ public:
+  rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
+      uint32_t fields_count, rdc_field_value_f callback,
+      void* user_data) override;
+
+  rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
+      uint32_t* field_count) override;
+
+  rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) override;
+
+  rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
+      uint32_t fields_count) override;
+
+  RdcTelemetryModule(const RdcMetricFetcherPtr& fetcher,
+      const RdcRasLibPtr& ras_module);
+
+ private:
+  // NOTE(review): the template arguments of these two members were lost in
+  // extraction; the types below are reconstructed from the member names and
+  // the uint32_t field ids used by the interface - confirm against the
+  // original file.
+  std::list<RdcTelemetryPtr> telemetry_modules_;
+  std::map<uint32_t, RdcTelemetryPtr> fields_id_module_;
+};
+
+typedef std::shared_ptr<RdcTelemetryModule> RdcTelemetryModulePtr;
+
+}  // namespace rdc
+}  // namespace amd
+
+
+#endif  // RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_
diff --git a/include/rdc_lib/impl/RdcWatchTableImpl.h b/include/rdc_lib/impl/RdcWatchTableImpl.h
index 4216e80968..c2c2b3fde6 100644
--- a/include/rdc_lib/impl/RdcWatchTableImpl.h
+++ b/include/rdc_lib/impl/RdcWatchTableImpl.h
@@ -29,11 +29,11 @@ THE SOFTWARE.
#include #include // NOLINT #include -#include #include "rdc_lib/RdcWatchTable.h" #include "rdc_lib/RdcGroupSettings.h" #include "rdc_lib/RdcCacheManager.h" #include "rdc_lib/RdcMetricFetcher.h" +#include "rdc_lib/RdcModuleMgr.h" #include "rocm_smi/rocm_smi.h" namespace amd { @@ -83,9 +83,11 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< once per second. rdc_status_t rdc_field_update_all() override; + // TODO(bill_liu): Remove the RdcMetricFetcherPtr RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, - const RdcMetricFetcherPtr& metric_fetcher); + const RdcMetricFetcherPtr& metric_fetcher, + const RdcModuleMgrPtr& module_mgr); private: //!< Helper function to Update the fields_in_table when unwatch tables @@ -108,9 +110,14 @@ class RdcWatchTableImpl : public RdcWatchTable { rdc_status_t initialize_rsmi_handles(RdcFieldKey fk); + //!< The function will be pass as the callback for bulk fetch + static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, + uint32_t num_values, void* user_data); + RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; RdcMetricFetcherPtr metric_fetcher_; + RdcModuleMgrPtr rdc_module_mgr_; //!< The watch table to store the watch settings. 
std::map watch_table_; diff --git a/rdc_libs/CMakeLists.txt b/rdc_libs/CMakeLists.txt index aa5e7ed2a2..b2a3852238 100755 --- a/rdc_libs/CMakeLists.txt +++ b/rdc_libs/CMakeLists.txt @@ -146,6 +146,10 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcGroupSettingsImp set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcCacheManagerImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcMetricsUpdaterImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcWatchTableImpl.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcRasLib.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiLib.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcTelemetryModule.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_fields_supported.cc") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h") @@ -159,6 +163,12 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricsU set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcMetricsUpdaterImpl.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcWatchTable.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcWatchTableImpl.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcRasLib.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcSmiLib.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcModuleMgrImpl.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcModuleMgr.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcTelemetry.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcTelemetryModule.h") set(RDC_LIB_INC_LIST 
${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_fields_supported.h")

 message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
diff --git a/rdc_libs/bootstrap/src/RdcLibraryLoader.cc b/rdc_libs/bootstrap/src/RdcLibraryLoader.cc
index 014a661a94..eb56133760 100644
--- a/rdc_libs/bootstrap/src/RdcLibraryLoader.cc
+++ b/rdc_libs/bootstrap/src/RdcLibraryLoader.cc
@@ -28,6 +28,31 @@ namespace rdc {
 RdcLibraryLoader::RdcLibraryLoader(): libHandler_(nullptr) {
 }

+// Open the shared library |filename| with dlopen(). unload() is called first,
+// so the new handle replaces whatever libHandler_ held. Returns
+// RDC_ST_FAIL_LOAD_MODULE when |filename| is null or dlopen() fails.
+rdc_status_t RdcLibraryLoader::load(const char* filename) {
+  if (filename == nullptr) {
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
+  // unload() locks library_mutex_ and tests libHandler_ itself, so calling it
+  // unconditionally avoids reading libHandler_ here without the lock.
+  unload();
+
+  std::lock_guard<std::mutex> guard(library_mutex_);
+  libHandler_ = dlopen(filename, RTLD_LAZY);
+  if (!libHandler_) {
+    // dlerror() may return NULL; streaming a null char* is undefined behavior.
+    const char* error = dlerror();
+    RDC_LOG(RDC_ERROR, "Fail to open " << filename <<": "
+        << (error ? error : "unknown error"));
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
+  return RDC_ST_OK;
+}
+
 rdc_status_t RdcLibraryLoader::unload() {
   std::lock_guard<std::mutex> guard(library_mutex_);
   if (libHandler_) {
diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
index 2add0edd13..414378853b 100644
--- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
+++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
@@ -26,6 +26,7 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" #include "rdc_lib/impl/RdcCacheManagerImpl.h" #include "rdc_lib/impl/RdcWatchTableImpl.h" +#include "rdc_lib/impl/RdcModuleMgrImpl.h" #include "rdc_lib/rdc_common.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcException.h" @@ -70,8 +71,9 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode): group_settings_(new RdcGroupSettingsImpl()) , cache_mgr_(new RdcCacheManagerImpl()) , metric_fetcher_(new RdcMetricFetcherImpl()) + , rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)) , watch_table_(new RdcWatchTableImpl(group_settings_, - cache_mgr_, metric_fetcher_)) + cache_mgr_, metric_fetcher_, rdc_module_mgr_)) , metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) { if (mode == RDC_OPERATION_MODE_AUTO) { diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 7760a97fcc..192452232c 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -252,8 +252,12 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, value->status = RSMI_STATUS_NOT_SUPPORTED; auto read_rsmi_counter = [&](void) { - assert(get_rsmi_data(f_key) != nullptr); rsmi_data = get_rsmi_data(f_key); + if (rsmi_data == nullptr) { + value->status = RSMI_STATUS_NOT_SUPPORTED; + return; + } + value->status = rsmi_counter_read(rsmi_data->evt_handle, &rsmi_data->counter_val); value->value.l_int = rsmi_data->counter_val.value; diff --git a/rdc_libs/rdc/src/RdcModuleMgrImpl.cc b/rdc_libs/rdc/src/RdcModuleMgrImpl.cc new file mode 100644 index 0000000000..2dc0368935 --- /dev/null +++ b/rdc_libs/rdc/src/RdcModuleMgrImpl.cc @@ -0,0 +1,53 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "rdc_lib/impl/RdcModuleMgrImpl.h"
+#include "rdc_lib/impl/RdcTelemetryModule.h"
+#include "rdc_lib/impl/RdcRasLib.h"
+
+namespace amd {
+namespace rdc {
+
+// Only stores the fetcher; the modules are created on first request.
+RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher)
+    : fetcher_(fetcher) {}
+
+
+// Returns the telemetry module, creating it (and the RAS library wrapper it
+// is built from) the first time it is asked for.
+RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
+  if (!rdc_telemetry_module_) {
+    // Delay load
+    if (!ras_lib_) {
+      ras_lib_.reset(new RdcRasLib("librdc_ras.so"));
+    }
+    rdc_telemetry_module_.reset(new RdcTelemetryModule(fetcher_, ras_lib_));
+  }
+
+  return rdc_telemetry_module_;
+}
+
+}  // namespace rdc
+}  // namespace amd
+
diff --git a/rdc_libs/rdc/src/RdcRasLib.cc b/rdc_libs/rdc/src/RdcRasLib.cc
new file mode 100644
index 0000000000..3de78cdb28
--- /dev/null
+++ b/rdc_libs/rdc/src/RdcRasLib.cc
@@ -0,0 +1,104 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc.
All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+// NOTE(review): the system header name on the next line was lost when this
+// patch was extracted; restore it from the original file before applying.
+#include
+#include "rdc_lib/rdc_common.h"
+#include "rdc_lib/RdcLogger.h"
+#include "rdc_lib/impl/RdcRasLib.h"
+
+namespace amd {
+namespace rdc {
+
+// Load |lib_name| and resolve the two entry points this class forwards to.
+// A pointer stays null when the library or that symbol cannot be loaded; the
+// member functions then return RDC_ST_FAIL_LOAD_MODULE.
+RdcRasLib::RdcRasLib(const char* lib_name)
+    : fields_value_get_(nullptr), fields_query_(nullptr) {
+  if (lib_loader_.load(lib_name) != RDC_ST_OK) {
+    return;
+  }
+
+  // Reset defensively on failure rather than relying on what load_symbol()
+  // left behind in the pointer.
+  if (lib_loader_.load_symbol(&fields_value_get_,
+          "rdc_telemetry_fields_value_get") != RDC_ST_OK) {
+    fields_value_get_ = nullptr;
+  }
+  if (lib_loader_.load_symbol(&fields_query_,
+          "rdc_telemetry_fields_query") != RDC_ST_OK) {
+    fields_query_ = nullptr;
+  }
+}
+
+// Forward the query to the loaded library. When the entry point is missing,
+// *field_count is set to 0 and RDC_ST_FAIL_LOAD_MODULE is returned.
+rdc_status_t RdcRasLib::rdc_telemetry_fields_query(
+    uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) {
+  if (field_count == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  if (!fields_query_) {
+    *field_count = 0;
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+
+  // Log after the call: before it, *field_count holds whatever the caller
+  // left there, not the number of fields the library reports.
+  const rdc_status_t status = fields_query_(field_ids, field_count);
+  if (status == RDC_ST_OK) {
+    RDC_LOG(RDC_DEBUG, "RAS support " << *field_count << " fields");
+  }
+  return status;
+}
+
+// Forward the bulk fetch to the loaded library and log the resulting status.
+rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get(
+    rdc_gpu_field_t* fields, uint32_t fields_count, rdc_field_value_f callback,
+    void* user_data) {
+  if (fields == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  if (!fields_value_get_) {
+    return RDC_ST_FAIL_LOAD_MODULE;
+  }
+  rdc_status_t status = fields_value_get_(fields,
+      fields_count, callback, user_data);
+  RDC_LOG(RDC_DEBUG, "Bulk fetched " << fields_count << " fields from RAS: "
+      << rdc_status_string(status));
+  return status;
+}
+
+rdc_status_t RdcRasLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
+    uint32_t fields_count) {
+  // TODO(bill_liu): Not Support yet
+  if (fields == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  (void)fields_count;  // unused until watching is implemented
+  return RDC_ST_NOT_SUPPORTED;
+}
+
+rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
+    uint32_t fields_count) {
+  // TODO(bill_liu): Not Support yet
+  if (fields == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  (void)fields_count;  // unused until unwatching is implemented
+  return RDC_ST_NOT_SUPPORTED;
+}
+
+}  // namespace rdc
+}  // namespace amd
+
diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc
new file mode 100644
index 0000000000..43b2dcbe6c
--- /dev/null
+++ b/rdc_libs/rdc/src/RdcSmiLib.cc
@@ -0,0 +1,125 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, + uint32_t fields_count, rdc_field_value_f callback, + void* user_data) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + RDC_LOG(RDC_DEBUG, "Bulk fetch " << fields_count + << " fields from rocm_smi_lib."); + + const int BULK_FIELDS_MAX = 16; + rdc_gpu_field_value_t values[BULK_FIELDS_MAX]; + uint32_t bulk_count = 0; + for (uint32_t i = 0; i < fields_count; i++) { + if (bulk_count >= BULK_FIELDS_MAX) { + rdc_status_t status = callback(values, bulk_count, user_data); + // When the callback returns errors, stop processing and return. + if (status != RDC_ST_OK) { + return status; + } + bulk_count = 0; + } + values[bulk_count].gpu_index = fields[i].gpu_index; + metric_fetcher_->fetch_smi_field( + fields[i].gpu_index, + static_cast(fields[i].field_id), + &(values[bulk_count].field_value)); + bulk_count++; + } + if (bulk_count != 0) { + rdc_status_t status = callback(values, bulk_count, user_data); + if (status != RDC_ST_OK) { + return status; + } + bulk_count = 0; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, + uint32_t fields_count) { + // TODO(bill_liu): Not Support yet + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + (void)fields; + (void)fields_count; + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, + uint32_t fields_count) { + // TODO(bill_liu): Not Support yet + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + (void)fields; + (void)fields_count; + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcSmiLib::rdc_telemetry_fields_query( + uint32_t field_ids[MAX_NUM_FIELDS], + uint32_t* field_count) { + if (field_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + // List of fields supported by rocm_smi_lib + const std::vector fields{ + RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME, RDC_FI_GPU_CLOCK, + RDC_FI_MEM_CLOCK, 
RDC_FI_MEMORY_TEMP, RDC_FI_GPU_TEMP, + RDC_FI_POWER_USAGE, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, + RDC_FI_GPU_UTIL, RDC_FI_GPU_MEMORY_USAGE, RDC_FI_GPU_MEMORY_TOTAL, + RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, + RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, + RDC_EVNT_XGMI_0_RESP_TX, RDC_EVNT_XGMI_0_BEATS_TX, + RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX, + RDC_EVNT_XGMI_1_RESP_TX, RDC_EVNT_XGMI_1_BEATS_TX, + RDC_EVNT_XGMI_0_THRPUT, RDC_EVNT_XGMI_1_THRPUT + }; + std::copy(fields.begin(), fields.end(), field_ids); + *field_count = fields.size(); + + return RDC_ST_OK; +} + +} // namespace rdc +} // namespace amd + diff --git a/rdc_libs/rdc/src/RdcTelemetryModule.cc b/rdc_libs/rdc/src/RdcTelemetryModule.cc new file mode 100644 index 0000000000..ddda670938 --- /dev/null +++ b/rdc_libs/rdc/src/RdcTelemetryModule.cc @@ -0,0 +1,138 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include "rdc_lib/impl/RdcTelemetryModule.h" +#include +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/impl/RdcSmiLib.h" + +namespace amd { +namespace rdc { + +// Return all supported fields +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_query( + uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) { + if (field_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + auto ite = telemetry_modules_.begin(); + *field_count = 0; + uint32_t count = 0; + for (; ite != telemetry_modules_.end(); ite++) { + rdc_status_t status = (*ite)->rdc_telemetry_fields_query( + &(field_ids[count]), &count); + if (status == RDC_ST_OK) { + *field_count += count; + } + } + + return RDC_ST_OK; +} + +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_watch( + rdc_gpu_field_t* fields, uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + auto ite = telemetry_modules_.begin(); + for (; ite != telemetry_modules_.end(); ite++) { + (*ite)->rdc_telemetry_fields_watch( + fields, fields_count); + } + return RDC_ST_OK; +} + + +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch( + rdc_gpu_field_t* fields, uint32_t fields_count) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + auto ite = telemetry_modules_.begin(); + for (; ite != telemetry_modules_.end(); ite++) { + (*ite)->rdc_telemetry_fields_unwatch( + fields, fields_count); + } + return RDC_ST_OK; +} + +RdcTelemetryModule::RdcTelemetryModule( + const RdcMetricFetcherPtr& fetcher, + const RdcRasLibPtr& ras_module) { + auto smi_telemetry_module = std::make_shared(fetcher); + telemetry_modules_.push_back(smi_telemetry_module); + if (ras_module) { + telemetry_modules_.push_back(ras_module); + } + + auto ite = telemetry_modules_.begin(); + for (; ite != telemetry_modules_.end(); ite++) { + uint32_t field_ids[MAX_NUM_FIELDS]; + uint32_t field_count; + (*ite)->rdc_telemetry_fields_query(field_ids, &field_count); + for (uint32_t index = 0; index < field_count; 
index++) { + fields_id_module_.insert({field_ids[index], (*ite)}); + } + } +} + +rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get( + rdc_gpu_field_t* fields, uint32_t fields_count, + rdc_field_value_f callback, void* user_data) { + if (fields == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + // Dispatch the fields to the libraries + std::map> fields_to_fetch; + std::vector unsupport_fields; + for (uint32_t findex = 0; findex < fields_count; findex++) { + RdcTelemetryPtr module = fields_id_module_[fields[findex].field_id]; + if (module) { + fields_to_fetch[module].push_back(fields[findex]); + } else { + RDC_LOG(RDC_DEBUG, "Unsupported field " << + field_id_string(fields[findex].field_id)); + rdc_gpu_field_value_t value; + value.gpu_index = fields[findex].gpu_index; + value.field_value.field_id = fields[findex].field_id; + value.field_value.status = RDC_ST_NOT_SUPPORTED; + unsupport_fields.push_back(value); + } + } + + auto ite = fields_to_fetch.begin(); + for (; ite != fields_to_fetch.end(); ite ++) { + rdc_gpu_field_t f[MAX_NUM_FIELDS]; + std::copy(ite->second.begin(), ite->second.end(), f); + ite->first->rdc_telemetry_fields_value_get(f, + ite->second.size(), callback, user_data); + } + + // Notify the caller unsupported fields + callback(&unsupport_fields[0], unsupport_fields.size(), user_data); + + return RDC_ST_OK; +} + +} // namespace rdc +} // namespace amd + diff --git a/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/rdc_libs/rdc/src/RdcWatchTableImpl.cc index f019395101..1d2d82df9d 100644 --- a/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -38,10 +38,12 @@ namespace rdc { RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, - const RdcMetricFetcherPtr& metric_fetcher): + const RdcMetricFetcherPtr& metric_fetcher, + const RdcModuleMgrPtr& module_mgr): group_settings_(group_settings) , cache_mgr_(cache_mgr) , metric_fetcher_(metric_fetcher) + , 
rdc_module_mgr_(module_mgr) , last_cleanup_time_(0) { } @@ -213,6 +215,11 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, auto f_in_watch_iter = fields_in_watch.begin(); for (; f_in_watch_iter != fields_in_watch.end(); f_in_watch_iter++) { + // Skip not support fields + result = metric_fetcher_->acquire_rsmi_handle(*f_in_watch_iter); + if (result != RDC_ST_OK) { + continue; + } auto ite = fields_to_watch_.find(*f_in_watch_iter); if (ite == fields_to_watch_.end()) { // A new field fields_to_watch_.insert({*f_in_watch_iter, f}); @@ -230,10 +237,6 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, f_in_table.update_freq = update_freq; } } - result = metric_fetcher_->acquire_rsmi_handle(*f_in_watch_iter); - if (result != RDC_ST_OK && result != RDC_ST_ALREADY_EXIST) { - return result; - } } // Add to the watch table @@ -351,14 +354,53 @@ bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, return false; } -rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { - uint32_t items_fetched = 0; - rdc_status_t result; +rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, + uint32_t num_values, void* user_data) { + if (values == nullptr || user_data == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + RdcWatchTableImpl* watchTable = static_cast(user_data); + for (uint32_t i = 0; i < num_values; i++) { + auto gpu_index = values[i].gpu_index; + auto field_id = values[i].field_value.field_id; + + // Always Update the timestamp + auto ite = watchTable->fields_to_watch_.find({gpu_index, field_id}); + if (ite != watchTable->fields_to_watch_.end()) { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now = static_cast(tv.tv_sec) * 1000 + + tv.tv_usec / 1000; + ite->second.last_update_time = now; + } + + // Only cache valid results + if (values[i].field_value.status != RDC_ST_OK) { + continue; + } + + // Update the cache + watchTable->cache_mgr_->rdc_update_cache(gpu_index, + 
values[i].field_value); + + // Update the job stats cache + std::string job_id; + if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) { + watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, + job_id, values[i].field_value); + } + } + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { struct timeval tv; gettimeofday(&tv, NULL); uint64_t now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; + // Collect all fields need to be updated for bulk fetch + std::vector fields; std::lock_guard guard(watch_mutex_); auto fite = fields_to_watch_.begin(); for (; fite != fields_to_watch_.end(); fite++) { @@ -368,35 +410,18 @@ rdc_status_t RdcWatchTableImpl::rdc_field_update_all() { fite->second.last_update_time+track_freq > now) { continue; } + fields.push_back({fite->first.first, fite->first.second}); + } - // Fetch the metric from rocm_smi_lib - rdc_field_value value; - result = metric_fetcher_->fetch_smi_field( - fite->first.first, fite->first.second, &value); - if (result != RDC_ST_OK) { - // To prevent frequently retry when error, update the time - gettimeofday(&tv, NULL); - now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; - fite->second.last_update_time = now; - continue; + if (fields.size() != 0) { + auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module(); + if (rdc_telemetry) { + rdc_telemetry->rdc_telemetry_fields_value_get(&fields[0], + fields.size(), RdcWatchTableImpl::handle_fields, this); + } else { + RDC_LOG(RDC_ERROR, + "RdcWatchTableImpl: Fail to get the telemetry module"); } - - // Update the cache - cache_mgr_->rdc_update_cache(fite->first.first, value); - - // Update the job stats cache - std::string job_id; - if (is_job_watch_field(fite->first.first, fite->first.second, job_id)) { - cache_mgr_->rdc_update_job_stats(fite->first.first, job_id, value); - } - - - // Update the last_upate_time - gettimeofday(&tv, NULL); - now = static_cast(tv.tv_sec)*1000+tv.tv_usec/1000; - fite->second.last_update_time = now; - - 
items_fetched++; } // Clean up is expensive, only do it once per second