From 8b1c8878343a9c0c8e126d029e4923d91c19fe34 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Thu, 4 Feb 2021 08:17:48 -0600 Subject: [PATCH] Turn on/off DAC capabilities as needed Write access is required for some RSMI services. This change temporarily permits write access so configuration can be done, and then turns it off. To help with this, the ScopedCapability struct is introduced to provide scope limited access, helping to ensure a process is not left with extra capability, should an exception occur. Change-Id: I4978a1a688db935b8bfc27b3b537a0dd07959d3f [ROCm/rdc commit: 6b5aeaaa23d68960e335a0a75d485c683434600c] --- .../rdc_capabilities.cc} | 4 ++- .../rdc_capabilities.h} | 29 ++++++++++++++++--- projects/rdc/rdc_libs/CMakeLists.txt | 4 ++- .../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 27 +++++++++++++++++ .../rdc_libs/rdc/src/RdcNotificationImpl.cc | 25 ++++++++++++++-- projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc | 8 ++++- projects/rdc/server/CMakeLists.txt | 2 +- projects/rdc/server/src/rdc_server_main.cc | 2 +- 8 files changed, 89 insertions(+), 12 deletions(-) rename projects/rdc/{server/src/rdc_server_utils.cc => common/rdc_capabilities.cc} (96%) rename projects/rdc/{server/include/rdc/rdc_server_utils.h => common/rdc_capabilities.h} (65%) diff --git a/projects/rdc/server/src/rdc_server_utils.cc b/projects/rdc/common/rdc_capabilities.cc similarity index 96% rename from projects/rdc/server/src/rdc_server_utils.cc rename to projects/rdc/common/rdc_capabilities.cc index 8a2675cba7..59caff17b0 100755 --- a/projects/rdc/server/src/rdc_server_utils.cc +++ b/projects/rdc/common/rdc_capabilities.cc @@ -1,5 +1,5 @@ /* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,6 +24,8 @@ THE SOFTWARE. #include #include +#include "common/rdc_capabilities.h" + namespace amd { namespace rdc { diff --git a/projects/rdc/server/include/rdc/rdc_server_utils.h b/projects/rdc/common/rdc_capabilities.h similarity index 65% rename from projects/rdc/server/include/rdc/rdc_server_utils.h rename to projects/rdc/common/rdc_capabilities.h index 725133245a..30dd6e0d4e 100755 --- a/projects/rdc/server/include/rdc/rdc_server_utils.h +++ b/projects/rdc/common/rdc_capabilities.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,8 +19,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_ -#define SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_ + +#ifndef COMMON_RDC_CAPABILITIES_H_ +#define COMMON_RDC_CAPABILITIES_H_ #include @@ -29,7 +30,27 @@ namespace rdc { int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled); int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable); + +struct ScopedCapability { + ScopedCapability(cap_value_t cp, cap_flag_t cpt) : + cap_(cp), cap_type_(cpt), error_(0) { + error_ = ModifyCapability(cap_, cap_type_, true); + } + ~ScopedCapability() { + error_ = ModifyCapability(cap_, cap_type_, false); + } + void Relinquish(void) { + error_ = ModifyCapability(cap_, cap_type_, false); + } + int error(void) {return error_;} + private: + cap_value_t cap_; + cap_flag_t cap_type_; + int error_; +}; + } // namespace rdc } // namespace amd -#endif // SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_ +#endif // COMMON_RDC_CAPABILITIES_H_ + diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index bd25292c27..3afdd7c2bf 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -163,6 +163,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_fields_supported.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_capabilities.cc") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricFetcher.h") @@ -185,12 +186,13 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotifica set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_fields_supported.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64") add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST}) -target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64) +target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap) target_include_directories(${RDC_LIB} PRIVATE "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 31fff6d76a..c5fa2471b5 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -33,6 +33,7 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rocm_smi/rocm_smi.h" #include "rdc_lib/impl/RsmiUtils.h" +#include "common/rdc_capabilities.h" namespace amd { namespace rdc { @@ -523,12 +524,32 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, } rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f); + + // Temporarily get DAC capability + ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to acquire required capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } + ret = rsmi_dev_counter_create(dv_ind, evt, handle); if (ret != RSMI_STATUS_SUCCESS) { return Rsmi2RdcError(ret); } ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr); + + // Release DAC capability + sc.Relinquish(); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to relinquish capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } + return Rsmi2RdcError(ret); } @@ -561,6 +582,9 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) { ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr); if (ret != RSMI_STATUS_SUCCESS) { rsmi_data_.erase(fk); + + RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << + Rsmi2RdcError(ret)); return Rsmi2RdcError(ret); } @@ -590,6 +614,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { result = init_rsmi_counter(fk, grp, &handle); if (result != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result); return result; } auto fsh = std::shared_ptr(new FieldRSMIData); @@ -636,6 +661,8 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { RDC_LOG(RDC_ERROR, "No event counters are available for " << field_id_to_descript.at(fk.second).enum_name << " event."); + } else if (ret != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Error in getting event counter handle: " << ret); } return ret; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc index 52d2e38235..2686d0e07e 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc @@ -34,6 +34,7 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcSmiLib.h" #include "rocm_smi/rocm_smi.h" +#include "common/rdc_capabilities.h" namespace amd { namespace rdc { @@ -95,9 +96,19 @@ RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { // No change to mask; nothing to be done continue; } + + // Temporarily get DAC capability + ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to acquire required capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } + ret = rsmi_event_notification_init(it->first); if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_INFO, + RDC_LOG(RDC_ERROR, "rsmi_event_notification_init() returned " << ret << " for device " << it->first << ". " << std::endl << " Will not listen for events on this device"); @@ -105,6 +116,14 @@ RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { } ret = rsmi_event_notification_mask_set(it->first, it->second); + // Release DAC capability + sc.Relinquish(); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to relinquish capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } if (ret == RSMI_STATUS_SUCCESS) { gpu_evnt_notif_masks_[it->first] = it->second; @@ -162,7 +181,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) { ret = rsmi_event_notification_mask_set(gpu_id, 0); if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret + RDC_LOG(RDC_ERROR, "rsmi_event_notification_mask_set() returned " << ret << " for device " << gpu_id); } @@ -171,7 +190,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) { std::lock_guard guard(notif_mutex_); gpu_evnt_notif_masks_[gpu_id] = 0; } else { - RDC_LOG(RDC_INFO, "rsmi_event_notification_stop() returned " << ret + RDC_LOG(RDC_ERROR, "rsmi_event_notification_stop() returned " << ret << " for device " << gpu_id); } return RDC_ST_OK; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 2580205d15..01e4d5b444 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -98,13 +98,19 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) { + rdc_status_t ret; + if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } for (uint32_t i = 0; i < fields_count; i++) { - metric_fetcher_->acquire_rsmi_handle( + ret = metric_fetcher_->acquire_rsmi_handle( {fields[i].gpu_index, fields[i].field_id}); + if (ret != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, + "Failed to acquire rocm_smi handle for field."); + } } RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from rocm_smi_lib"); diff --git a/projects/rdc/server/CMakeLists.txt b/projects/rdc/server/CMakeLists.txt index e770a4e781..5536203400 100755 --- a/projects/rdc/server/CMakeLists.txt +++ b/projects/rdc/server/CMakeLists.txt @@ -75,9 +75,9 @@ set(SERVER_SRC_LIST "${SRC_DIR}/rdc_rsmi_service.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_admin_service.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_api_service.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_main.cc") -set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_utils.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_utils.cc") +set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_capabilities.cc") message("SERVER_SRC_LIST=${SERVER_SRC_LIST}") set(SERVER_DAEMON_EXE "rdcd") diff --git a/projects/rdc/server/src/rdc_server_main.cc b/projects/rdc/server/src/rdc_server_main.cc index aaf67f54b5..42041f50be 100755 --- a/projects/rdc/server/src/rdc_server_main.cc +++ b/projects/rdc/server/src/rdc_server_main.cc @@ -41,7 +41,7 @@ THE SOFTWARE. #include "rdc/rdc_server_main.h" #include "rdc/rdc_rsmi_service.h" #include "rdc/rdc_api_service.h" -#include "rdc/rdc_server_utils.h" +#include "common/rdc_capabilities.h" #include "common/rdc_utils.h" // TODO(cfreehil):