Turn on/off DAC capabilities as needed

Write access is required for some RSMI services. This change
temporarily permits write access so configuration can be done,
and then turns it off.

To help with this, the ScopedCapability struct is introduced to
provide scope-limited access, helping to ensure that a process is not
left with extra capabilities should an exception occur.

Change-Id: I4978a1a688db935b8bfc27b3b537a0dd07959d3f


[ROCm/rdc commit: 6b5aeaaa23]
This commit is contained in:
Chris Freehill
2021-02-04 08:17:48 -06:00
parent 7cf47fb5c9
commit 8b1c887834
8 changed files with 89 additions and 12 deletions
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -24,6 +24,8 @@ THE SOFTWARE.
#include <errno.h>
#include <assert.h>
#include "common/rdc_capabilities.h"
namespace amd {
namespace rdc {
@@ -1,5 +1,5 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -19,8 +19,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
#define SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
#ifndef COMMON_RDC_CAPABILITIES_H_
#define COMMON_RDC_CAPABILITIES_H_
#include <sys/capability.h>
@@ -29,7 +30,27 @@ namespace rdc {
// Queries capability |cap| in the |cap_type| flag set and reports the result
// through |enabled|.
// NOTE(review): implementation is not visible here; presumably returns 0 on
// success and an errno-style value on failure -- confirm against
// rdc_capabilities.cc.
int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled);
// Turns capability |cap| in the |cap_type| flag set on (|enable| == true) or
// off (|enable| == false). Callers in this change treat a non-zero return as
// failure and log it as an errno value.
int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable);
// Scope guard around ModifyCapability(): the constructor asks for capability
// |cp| to be turned on in flag set |cpt|, and the destructor asks for it to be
// turned off again, so an early return or exception cannot leave the
// capability enabled.
//
// Usage:
//   ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
//   if (sc.error()) { /* could not acquire */ }
//   ... privileged work ...
//   sc.Relinquish();          // optional explicit early release
//   if (sc.error()) { /* could not release */ }
struct ScopedCapability {
  // Requests that the capability be enabled. The result of that request is
  // available through error(); 0 means success.
  ScopedCapability(cap_value_t cp, cap_flag_t cpt) :
      cap_(cp), cap_type_(cpt), error_(0), released_(false) {
    error_ = ModifyCapability(cap_, cap_type_, true);
  }

  // Not copyable: a copy's destructor would drop the capability while the
  // original guard still appears to hold it.
  ScopedCapability(const ScopedCapability&) = delete;
  ScopedCapability& operator=(const ScopedCapability&) = delete;

  // Drops the capability unless an earlier Relinquish() already succeeded.
  // A failed (or never attempted) release is retried here.
  ~ScopedCapability() {
    if (!released_) {
      error_ = ModifyCapability(cap_, cap_type_, false);
    }
  }

  // Drops the capability before the end of the scope. The outcome is
  // reported through error().
  void Relinquish(void) {
    error_ = ModifyCapability(cap_, cap_type_, false);
    // Only a successful drop lets the destructor skip its own attempt.
    released_ = (error_ == 0);
  }

  // Status returned by the most recent ModifyCapability() call made by this
  // guard; non-zero indicates failure.
  int error(void) const {return error_;}

 private:
  cap_value_t cap_;
  cap_flag_t cap_type_;
  int error_;
  bool released_;  // true once Relinquish() has succeeded
};
} // namespace rdc
} // namespace amd
#endif // SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
#endif // COMMON_RDC_CAPABILITIES_H_
+3 -1
View File
@@ -163,6 +163,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_fields_supported.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_capabilities.cc")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricFetcher.h")
@@ -185,12 +186,13 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotifica
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_fields_supported.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h")
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64")
add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST})
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64)
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap)
target_include_directories(${RDC_LIB} PRIVATE
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
@@ -33,6 +33,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rocm_smi/rocm_smi.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "common/rdc_capabilities.h"
namespace amd {
namespace rdc {
@@ -523,12 +524,32 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk,
}
rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f);
// Temporarily get DAC capability
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to acquire required capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
ret = rsmi_dev_counter_create(dv_ind, evt, handle);
if (ret != RSMI_STATUS_SUCCESS) {
return Rsmi2RdcError(ret);
}
ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr);
// Release DAC capability
sc.Relinquish();
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to relinquish capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
return Rsmi2RdcError(ret);
}
@@ -561,6 +582,9 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr);
if (ret != RSMI_STATUS_SUCCESS) {
rsmi_data_.erase(fk);
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " <<
Rsmi2RdcError(ret));
return Rsmi2RdcError(ret);
}
@@ -590,6 +614,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
result = init_rsmi_counter(fk, grp, &handle);
if (result != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result);
return result;
}
auto fsh = std::shared_ptr<FieldRSMIData>(new FieldRSMIData);
@@ -636,6 +661,8 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
RDC_LOG(RDC_ERROR, "No event counters are available for " <<
field_id_to_descript.at(fk.second).enum_name << " event.");
} else if (ret != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Error in getting event counter handle: " << ret);
}
return ret;
}
@@ -34,6 +34,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rocm_smi/rocm_smi.h"
#include "common/rdc_capabilities.h"
namespace amd {
namespace rdc {
@@ -95,9 +96,19 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
// No change to mask; nothing to be done
continue;
}
// Temporarily get DAC capability
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to acquire required capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
ret = rsmi_event_notification_init(it->first);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO,
RDC_LOG(RDC_ERROR,
"rsmi_event_notification_init() returned " << ret << " for device " <<
it->first << ". " << std::endl <<
" Will not listen for events on this device");
@@ -105,6 +116,14 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
}
ret = rsmi_event_notification_mask_set(it->first, it->second);
// Release DAC capability
sc.Relinquish();
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to relinquish capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
if (ret == RSMI_STATUS_SUCCESS) {
gpu_evnt_notif_masks_[it->first] = it->second;
@@ -162,7 +181,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
ret = rsmi_event_notification_mask_set(gpu_id, 0);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret
RDC_LOG(RDC_ERROR, "rsmi_event_notification_mask_set() returned " << ret
<< " for device " << gpu_id);
}
@@ -171,7 +190,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
std::lock_guard<std::mutex> guard(notif_mutex_);
gpu_evnt_notif_masks_[gpu_id] = 0;
} else {
RDC_LOG(RDC_INFO, "rsmi_event_notification_stop() returned " << ret
RDC_LOG(RDC_ERROR, "rsmi_event_notification_stop() returned " << ret
<< " for device " << gpu_id);
}
return RDC_ST_OK;
+7 -1
View File
@@ -98,13 +98,19 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) {
rdc_status_t ret;
if (fields == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
for (uint32_t i = 0; i < fields_count; i++) {
metric_fetcher_->acquire_rsmi_handle(
ret = metric_fetcher_->acquire_rsmi_handle(
{fields[i].gpu_index, fields[i].field_id});
if (ret != RDC_ST_OK) {
RDC_LOG(RDC_ERROR,
"Failed to acquire rocm_smi handle for field.");
}
}
RDC_LOG(RDC_DEBUG, "acquire " << fields_count
<< " field handles from rocm_smi_lib");
+1 -1
View File
@@ -75,9 +75,9 @@ set(SERVER_SRC_LIST "${SRC_DIR}/rdc_rsmi_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_admin_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_api_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_main.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_utils.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_utils.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_capabilities.cc")
message("SERVER_SRC_LIST=${SERVER_SRC_LIST}")
set(SERVER_DAEMON_EXE "rdcd")
+1 -1
View File
@@ -41,7 +41,7 @@ THE SOFTWARE.
#include "rdc/rdc_server_main.h"
#include "rdc/rdc_rsmi_service.h"
#include "rdc/rdc_api_service.h"
#include "rdc/rdc_server_utils.h"
#include "common/rdc_capabilities.h"
#include "common/rdc_utils.h"
// TODO(cfreehil):