Turn on/off DAC capabilities as needed
Write access is required for some RSMI services. This change
temporarily permits write access so configuration can be done,
and then turns it off.
To help with this, the ScopedCapability struct is introduced to
provide scope-limited access, helping to ensure a process is not
left with extra capabilities should an exception occur.
Change-Id: I4978a1a688db935b8bfc27b3b537a0dd07959d3f
[ROCm/rdc commit: 6b5aeaaa23]
This commit is contained in:
+3
-1
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -24,6 +24,8 @@ THE SOFTWARE.
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "common/rdc_capabilities.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
+25
-4
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -19,8 +19,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
|
||||
#define SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
|
||||
|
||||
#ifndef COMMON_RDC_CAPABILITIES_H_
|
||||
#define COMMON_RDC_CAPABILITIES_H_
|
||||
|
||||
#include <sys/capability.h>
|
||||
|
||||
@@ -29,7 +30,27 @@ namespace rdc {
|
||||
|
||||
// Queries capability `cap` in the `cap_type` flag set; presumably reports
// the current state through *enabled.
// NOTE(review): the definition is not visible here. The int result is
// presumably 0 on success and an errno-style value otherwise -- confirm
// against rdc_capabilities.cc.
int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled);
|
||||
// Enables (enable == true) or disables (enable == false) capability `cap`
// in the `cap_type` flag set. Callers in this change treat a non-zero
// result as a failure and log it as an errno value.
int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable);
|
||||
|
||||
struct ScopedCapability {
|
||||
ScopedCapability(cap_value_t cp, cap_flag_t cpt) :
|
||||
cap_(cp), cap_type_(cpt), error_(0) {
|
||||
error_ = ModifyCapability(cap_, cap_type_, true);
|
||||
}
|
||||
~ScopedCapability() {
|
||||
error_ = ModifyCapability(cap_, cap_type_, false);
|
||||
}
|
||||
void Relinquish(void) {
|
||||
error_ = ModifyCapability(cap_, cap_type_, false);
|
||||
}
|
||||
int error(void) {return error_;}
|
||||
private:
|
||||
cap_value_t cap_;
|
||||
cap_flag_t cap_type_;
|
||||
int error_;
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // SERVER_INCLUDE_RDC_RDC_SERVER_UTILS_H_
|
||||
#endif // COMMON_RDC_CAPABILITIES_H_
|
||||
|
||||
@@ -163,6 +163,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_fields_supported.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${COMMON_DIR}/rdc_capabilities.cc")
|
||||
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcEmbeddedHandler.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcMetricFetcher.h")
|
||||
@@ -185,12 +186,13 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotifica
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_fields_supported.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h")
|
||||
|
||||
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
|
||||
|
||||
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64")
|
||||
add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST})
|
||||
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64)
|
||||
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap)
|
||||
target_include_directories(${RDC_LIB} PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
|
||||
@@ -33,6 +33,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rdc_lib/impl/RsmiUtils.h"
|
||||
#include "common/rdc_capabilities.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -523,12 +524,32 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk,
|
||||
}
|
||||
|
||||
rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f);
|
||||
|
||||
// Temporarily get DAC capability
|
||||
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
|
||||
|
||||
if (sc.error()) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Failed to acquire required capabilities. Errno " << sc.error());
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
ret = rsmi_dev_counter_create(dv_ind, evt, handle);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return Rsmi2RdcError(ret);
|
||||
}
|
||||
|
||||
ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr);
|
||||
|
||||
// Release DAC capability
|
||||
sc.Relinquish();
|
||||
|
||||
if (sc.error()) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Failed to relinquish capabilities. Errno " << sc.error());
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
return Rsmi2RdcError(ret);
|
||||
}
|
||||
|
||||
@@ -561,6 +582,9 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
|
||||
ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
rsmi_data_.erase(fk);
|
||||
|
||||
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " <<
|
||||
Rsmi2RdcError(ret));
|
||||
return Rsmi2RdcError(ret);
|
||||
}
|
||||
|
||||
@@ -590,6 +614,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
|
||||
|
||||
result = init_rsmi_counter(fk, grp, &handle);
|
||||
if (result != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result);
|
||||
return result;
|
||||
}
|
||||
auto fsh = std::shared_ptr<FieldRSMIData>(new FieldRSMIData);
|
||||
@@ -636,6 +661,8 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
|
||||
|
||||
RDC_LOG(RDC_ERROR, "No event counters are available for " <<
|
||||
field_id_to_descript.at(fk.second).enum_name << " event.");
|
||||
} else if (ret != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Error in getting event counter handle: " << ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "common/rdc_capabilities.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -95,9 +96,19 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
|
||||
// No change to mask; nothing to be done
|
||||
continue;
|
||||
}
|
||||
|
||||
// Temporarily get DAC capability
|
||||
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
|
||||
|
||||
if (sc.error()) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Failed to acquire required capabilities. Errno " << sc.error());
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
ret = rsmi_event_notification_init(it->first);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_INFO,
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"rsmi_event_notification_init() returned " << ret << " for device " <<
|
||||
it->first << ". " << std::endl <<
|
||||
" Will not listen for events on this device");
|
||||
@@ -105,6 +116,14 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
|
||||
}
|
||||
|
||||
ret = rsmi_event_notification_mask_set(it->first, it->second);
|
||||
// Release DAC capability
|
||||
sc.Relinquish();
|
||||
|
||||
if (sc.error()) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Failed to relinquish capabilities. Errno " << sc.error());
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
gpu_evnt_notif_masks_[it->first] = it->second;
|
||||
@@ -162,7 +181,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
|
||||
|
||||
ret = rsmi_event_notification_mask_set(gpu_id, 0);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret
|
||||
RDC_LOG(RDC_ERROR, "rsmi_event_notification_mask_set() returned " << ret
|
||||
<< " for device " << gpu_id);
|
||||
}
|
||||
|
||||
@@ -171,7 +190,7 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
|
||||
std::lock_guard<std::mutex> guard(notif_mutex_);
|
||||
gpu_evnt_notif_masks_[gpu_id] = 0;
|
||||
} else {
|
||||
RDC_LOG(RDC_INFO, "rsmi_event_notification_stop() returned " << ret
|
||||
RDC_LOG(RDC_ERROR, "rsmi_event_notification_stop() returned " << ret
|
||||
<< " for device " << gpu_id);
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
|
||||
@@ -98,13 +98,19 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
|
||||
|
||||
rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) {
|
||||
rdc_status_t ret;
|
||||
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < fields_count; i++) {
|
||||
metric_fetcher_->acquire_rsmi_handle(
|
||||
ret = metric_fetcher_->acquire_rsmi_handle(
|
||||
{fields[i].gpu_index, fields[i].field_id});
|
||||
if (ret != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Failed to acquire rocm_smi handle for field.");
|
||||
}
|
||||
}
|
||||
RDC_LOG(RDC_DEBUG, "acquire " << fields_count
|
||||
<< " field handles from rocm_smi_lib");
|
||||
|
||||
@@ -75,9 +75,9 @@ set(SERVER_SRC_LIST "${SRC_DIR}/rdc_rsmi_service.cc")
|
||||
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_admin_service.cc")
|
||||
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_api_service.cc")
|
||||
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_main.cc")
|
||||
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_utils.cc")
|
||||
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}")
|
||||
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_utils.cc")
|
||||
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${RDC_SRC_ROOT}/common/rdc_capabilities.cc")
|
||||
message("SERVER_SRC_LIST=${SERVER_SRC_LIST}")
|
||||
|
||||
set(SERVER_DAEMON_EXE "rdcd")
|
||||
|
||||
@@ -41,7 +41,7 @@ THE SOFTWARE.
|
||||
#include "rdc/rdc_server_main.h"
|
||||
#include "rdc/rdc_rsmi_service.h"
|
||||
#include "rdc/rdc_api_service.h"
|
||||
#include "rdc/rdc_server_utils.h"
|
||||
#include "common/rdc_capabilities.h"
|
||||
#include "common/rdc_utils.h"
|
||||
|
||||
// TODO(cfreehil):
|
||||
|
||||
Reference in New Issue
Block a user