Turn on/off DAC capabilities as needed

Write access is required for some RSMI services. This change
temporarily permits write access so configuration can be done,
and then turns it off.

To help with this, the ScopedCapability struct is introduced to
provide scope limited access, helping to ensure a process is not
left with extra capability, should an exception occur.

Change-Id: I4978a1a688db935b8bfc27b3b537a0dd07959d3f
Este commit está contenido en:
Chris Freehill
2021-02-04 08:17:48 -06:00
padre a9d0e037b5
commit 6b5aeaaa23
Se han modificado 8 ficheros con 89 adiciones y 12 borrados
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -24,6 +24,8 @@ THE SOFTWARE.
#include <errno.h>
#include <assert.h>
#include "common/rdc_capabilities.h"
namespace amd {
namespace rdc {
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -19,8 +19,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
#define SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
#ifndef COMMON_RDC_CAPABILITIES_H_
#define COMMON_RDC_CAPABILITIES_H_
#include <sys/capability.h>
@@ -29,7 +30,27 @@ namespace rdc {
int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled);
int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable);
struct ScopedCapability {
ScopedCapability(cap_value_t cp, cap_flag_t cpt) :
cap_(cp), cap_type_(cpt), error_(0) {
error_ = ModifyCapability(cap_, cap_type_, true);
}
~ScopedCapability() {
error_ = ModifyCapability(cap_, cap_type_, false);
}
void Relinquish(void) {
error_ = ModifyCapability(cap_, cap_type_, false);
}
int error(void) {return error_;}
private:
cap_value_t cap_;
cap_flag_t cap_type_;
int error_;
};
} // namespace rdc
} // namespace amd
#endif // SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
#endif // COMMON_RDC_CAPABILITIES_H_
+3 -1
Ver fichero
@@ -163,6 +163,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_fields_supported.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_capabilities.cc")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricFetcher.h")
@@ -185,12 +186,13 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotifica
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_fields_supported.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h")
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64")
add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST})
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64)
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap)
target_include_directories(${RDC_LIB} PRIVATE
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
+27
Ver fichero
@@ -33,6 +33,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rocm_smi/rocm_smi.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "common/rdc_capabilities.h"
namespace amd {
namespace rdc {
@@ -523,12 +524,32 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk,
}
rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f);
// Temporarily get DAC capability
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to acquire required capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
ret = rsmi_dev_counter_create(dv_ind, evt, handle);
if (ret != RSMI_STATUS_SUCCESS) {
return Rsmi2RdcError(ret);
}
ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr);
// Release DAC capability
sc.Relinquish();
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to relinquish capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
return Rsmi2RdcError(ret);
}
@@ -561,6 +582,9 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr);
if (ret != RSMI_STATUS_SUCCESS) {
rsmi_data_.erase(fk);
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " <<
Rsmi2RdcError(ret));
return Rsmi2RdcError(ret);
}
@@ -590,6 +614,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
result = init_rsmi_counter(fk, grp, &handle);
if (result != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result);
return result;
}
auto fsh = std::shared_ptr<FieldRSMIData>(new FieldRSMIData);
@@ -636,6 +661,8 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
RDC_LOG(RDC_ERROR, "No event counters are available for " <<
field_id_to_descript.at(fk.second).enum_name << " event.");
} else if (ret != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Error in getting event counter handle: " << ret);
}
return ret;
}
+22 -3
Ver fichero
@@ -34,6 +34,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rocm_smi/rocm_smi.h"
#include "common/rdc_capabilities.h"
namespace amd {
namespace rdc {
@@ -95,9 +96,19 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
// No change to mask; nothing to be done
continue;
}
// Temporarily get DAC capability
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to acquire required capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
ret = rsmi_event_notification_init(it->first);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO,
RDC_LOG(RDC_ERROR,
"rsmi_event_notification_init() returned " << ret << " for device " <<
it->first << ". " << std::endl <<
" Will not listen for events on this device");
@@ -105,6 +116,14 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
}
ret = rsmi_event_notification_mask_set(it->first, it->second);
// Release DAC capability
sc.Relinquish();
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to relinquish capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
if (ret == RSMI_STATUS_SUCCESS) {
gpu_evnt_notif_masks_[it->first] = it->second;
@@ -162,7 +181,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
ret = rsmi_event_notification_mask_set(gpu_id, 0);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret
RDC_LOG(RDC_ERROR, "rsmi_event_notification_mask_set() returned " << ret
<< " for device " << gpu_id);
}
@@ -171,7 +190,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
std::lock_guard<std::mutex> guard(notif_mutex_);
gpu_evnt_notif_masks_[gpu_id] = 0;
} else {
RDC_LOG(RDC_INFO, "rsmi_event_notification_stop() returned " << ret
RDC_LOG(RDC_ERROR, "rsmi_event_notification_stop() returned " << ret
<< " for device " << gpu_id);
}
return RDC_ST_OK;
+7 -1
Ver fichero
@@ -98,13 +98,19 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) {
rdc_status_t ret;
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
for (uint32_t i = 0; i < fields_count; i++) {
metric_fetcher_->acquire_rsmi_handle(
ret = metric_fetcher_->acquire_rsmi_handle(
{fields[i].gpu_index, fields[i].field_id});
if (ret != RDC_ST_OK) {
RDC_LOG(RDC_ERROR,
"Failed to acquire rocm_smi handle for field.");
}
}
RDC_LOG(RDC_DEBUG, "acquire " << fields_count
<< " field handles from rocm_smi_lib");
+1 -1
Ver fichero
@@ -75,9 +75,9 @@ set(SERVER_SRC_LIST "${SRC_DIR}/rdc_rsmi_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_admin_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_api_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_main.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_utils.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_utils.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_capabilities.cc")
message("SERVER_SRC_LIST=${SERVER_SRC_LIST}")
set(SERVER_DAEMON_EXE "rdcd")
+1 -1
Ver fichero
@@ -41,7 +41,7 @@ THE SOFTWARE.
#include "rdc/rdc_server_main.h"
#include "rdc/rdc_rsmi_service.h"
#include "rdc/rdc_api_service.h"
#include "rdc/rdc_server_utils.h"
#include "common/rdc_capabilities.h"
#include "common/rdc_utils.h"
// TODO(cfreehil):