diff --git a/projects/rdc/server/src/rdc_server_utils.cc b/projects/rdc/common/rdc_capabilities.cc similarity index 96% rename from projects/rdc/server/src/rdc_server_utils.cc rename to projects/rdc/common/rdc_capabilities.cc index 8a2675cba7..59caff17b0 100755 --- a/projects/rdc/server/src/rdc_server_utils.cc +++ b/projects/rdc/common/rdc_capabilities.cc @@ -1,5 +1,5 @@ /* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,6 +24,8 @@ THE SOFTWARE. #include #include +#include "common/rdc_capabilities.h" + namespace amd { namespace rdc { diff --git a/projects/rdc/server/include/rdc/rdc_server_utils.h b/projects/rdc/common/rdc_capabilities.h similarity index 65% rename from projects/rdc/server/include/rdc/rdc_server_utils.h rename to projects/rdc/common/rdc_capabilities.h index 725133245a..30dd6e0d4e 100755 --- a/projects/rdc/server/include/rdc/rdc_server_utils.h +++ b/projects/rdc/common/rdc_capabilities.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,8 +19,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_ -#define SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_ + +#ifndef COMMON_RDC_CAPABILITIES_H_ +#define COMMON_RDC_CAPABILITIES_H_ #include @@ -29,7 +30,27 @@ namespace rdc { int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled); int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable); + +struct ScopedCapability { + ScopedCapability(cap_value_t cp, cap_flag_t cpt) : + cap_(cp), cap_type_(cpt), error_(0) { + error_ = ModifyCapability(cap_, cap_type_, true); + } + ~ScopedCapability() { + error_ = ModifyCapability(cap_, cap_type_, false); + } + void Relinquish(void) { + error_ = ModifyCapability(cap_, cap_type_, false); + } + int error(void) {return error_;} + private: + cap_value_t cap_; + cap_flag_t cap_type_; + int error_; +}; + } // namespace rdc } // namespace amd -#endif // SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_ +#endif // COMMON_RDC_CAPABILITIES_H_ + diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index bd25292c27..3afdd7c2bf 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -163,6 +163,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_fields_supported.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_capabilities.cc") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricFetcher.h") @@ -185,12 +186,13 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotifica set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_fields_supported.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64") add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST}) -target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64) +target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap) target_include_directories(${RDC_LIB} PRIVATE "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 31fff6d76a..c5fa2471b5 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -33,6 +33,7 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rocm_smi/rocm_smi.h" #include "rdc_lib/impl/RsmiUtils.h" +#include "common/rdc_capabilities.h" namespace amd { namespace rdc { @@ -523,12 +524,32 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, } rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f); + + // Temporarily get DAC capability + ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to acquire required capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } + ret = rsmi_dev_counter_create(dv_ind, evt, handle); if (ret != RSMI_STATUS_SUCCESS) { return Rsmi2RdcError(ret); } ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr); + + // Release DAC capability + sc.Relinquish(); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to relinquish capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } + return Rsmi2RdcError(ret); } @@ -561,6 +582,9 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) { ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr); if (ret != RSMI_STATUS_SUCCESS) { rsmi_data_.erase(fk); + + RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << + Rsmi2RdcError(ret)); return Rsmi2RdcError(ret); } @@ -590,6 +614,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { result = init_rsmi_counter(fk, grp, &handle); if (result != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result); return result; } auto fsh = std::shared_ptr(new FieldRSMIData); @@ -636,6 +661,8 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { RDC_LOG(RDC_ERROR, "No event counters are available for " << field_id_to_descript.at(fk.second).enum_name << " event."); + } else if (ret != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Error in getting event counter handle: " << ret); } return ret; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc index 52d2e38235..2686d0e07e 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc @@ -34,6 +34,7 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcSmiLib.h" #include "rocm_smi/rocm_smi.h" +#include "common/rdc_capabilities.h" namespace amd { namespace rdc { @@ -95,9 +96,19 @@ RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { // No change to mask; nothing to be done continue; } + + // Temporarily get DAC capability + ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to acquire required capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } + ret = rsmi_event_notification_init(it->first); if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_INFO, + RDC_LOG(RDC_ERROR, "rsmi_event_notification_init() returned " << ret << " for device " << it->first << ". " << std::endl << " Will not listen for events on this device"); @@ -105,6 +116,14 @@ RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { } ret = rsmi_event_notification_mask_set(it->first, it->second); + // Release DAC capability + sc.Relinquish(); + + if (sc.error()) { + RDC_LOG(RDC_ERROR, + "Failed to relinquish capabilities. Errno " << sc.error()); + return RDC_ST_PERM_ERROR; + } if (ret == RSMI_STATUS_SUCCESS) { gpu_evnt_notif_masks_[it->first] = it->second; @@ -162,7 +181,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) { ret = rsmi_event_notification_mask_set(gpu_id, 0); if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret + RDC_LOG(RDC_ERROR, "rsmi_event_notification_mask_set() returned " << ret << " for device " << gpu_id); } @@ -171,7 +190,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) { std::lock_guard guard(notif_mutex_); gpu_evnt_notif_masks_[gpu_id] = 0; } else { - RDC_LOG(RDC_INFO, "rsmi_event_notification_stop() returned " << ret + RDC_LOG(RDC_ERROR, "rsmi_event_notification_stop() returned " << ret << " for device " << gpu_id); } return RDC_ST_OK; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 2580205d15..01e4d5b444 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -98,13 +98,19 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) { + rdc_status_t ret; + if (fields == nullptr) { return RDC_ST_BAD_PARAMETER; } for (uint32_t i = 0; i < fields_count; i++) { - metric_fetcher_->acquire_rsmi_handle( + ret = metric_fetcher_->acquire_rsmi_handle( {fields[i].gpu_index, fields[i].field_id}); + if (ret != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, + "Failed to acquire rocm_smi handle for field."); + } } RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from rocm_smi_lib"); diff --git a/projects/rdc/server/CMakeLists.txt b/projects/rdc/server/CMakeLists.txt index e770a4e781..5536203400 100755 --- a/projects/rdc/server/CMakeLists.txt +++ b/projects/rdc/server/CMakeLists.txt @@ -75,9 +75,9 @@ set(SERVER_SRC_LIST "${SRC_DIR}/rdc_rsmi_service.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_admin_service.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_api_service.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_main.cc") -set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_utils.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_utils.cc") +set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_capabilities.cc") message("SERVER_SRC_LIST=${SERVER_SRC_LIST}") set(SERVER_DAEMON_EXE "rdcd") diff --git a/projects/rdc/server/src/rdc_server_main.cc b/projects/rdc/server/src/rdc_server_main.cc index aaf67f54b5..42041f50be 100755 --- a/projects/rdc/server/src/rdc_server_main.cc +++ b/projects/rdc/server/src/rdc_server_main.cc @@ -41,7 +41,7 @@ THE SOFTWARE. #include "rdc/rdc_server_main.h" #include "rdc/rdc_rsmi_service.h" #include "rdc/rdc_api_service.h" -#include "rdc/rdc_server_utils.h" +#include "common/rdc_capabilities.h" #include "common/rdc_utils.h" // TODO(cfreehil):