Add SSL mutual authentication support for rdci

The RDC API is changed to pass the certificates to gRPC.

Add support for adding all GPUs in the host to a group. Also, before
adding a GPU to a group, the RDC API now verifies that the GPU exists.

Add the support to fetch the temperature metrics.

Change-Id: I5857ef03fede233d16e8b2836be120f33172da93
This commit is contained in:
Bill(Shuzhou) Liu
2020-03-10 14:02:05 -04:00
committad av Chris Freehill
förälder 023de40df7
incheckning 66e4e790c3
20 ändrade filer med 232 tillägg och 57 borttagningar
+14 -1
Visa fil
@@ -1,9 +1,22 @@
# rdc
Radeon Data Center
## To run the rdcd and rdci from the build folder
## To run the rdcd and rdci from the build folder without authentication
```
sudo LD_LIBRARY_PATH=$PWD/rdc_libs/ ./server/rdcd -u
LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -u
```
## To run the rdcd and rdci from the build folder with authentication
```
sudo LD_LIBRARY_PATH=$PWD/rdc_libs/ ./server/rdcd
LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery
```
## Troubleshooting
Check the ssl connection in rdci:
```
rdcd_hostname="" # Set this to the hostname of the rdcd you want to connect to
openssl s_client -connect $rdcd_hostname:50051 -cert /etc/rdc/client/certs/rdc_client_cert.pem -key /etc/rdc/client/private/rdc_client_cert.key -CAfile /etc/rdc/client/certs/rdc_cacert.pem
```
+3 -1
Visa fil
@@ -23,6 +23,7 @@ nameopt = default_ca
certopt = default_ca
policy = policy_match
unique_subject = no
copy_extensions = copyall
[ policy_match ]
countryName = match
@@ -77,6 +78,7 @@ authorityKeyIdentifier = keyid:always,issuer:always
[ v3_req ]
basicConstraints = CA:FALSE
subjectKeyIdentifier = hash
subjectAltName = @alt_names
[ req_ext ]
subjectAltName = @alt_names
@@ -84,5 +86,5 @@ subjectAltName = @alt_names
[alt_names]
# < ** MODIFY BELOW TO YOUR NEEDS. WILDCARDS ARE ACCEPTED. **>
DNS.1 = localhost
DNS.2 = another-website.dev
DNS.2 = *.amd.com
+2 -1
Visa fil
@@ -63,7 +63,8 @@ int main(int, char **) {
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle);
result = rdc_connect(hostIpAddress, &rdc_handle,
nullptr, nullptr, nullptr);
if ( result != RDC_ST_OK ) {
std::cout << "Error connecting to remote rdcd. Return: "
<< rdc_status_string(result) << std::endl;
+2 -1
Visa fil
@@ -58,7 +58,8 @@ int main(int, char **) {
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle);
result = rdc_connect(hostIpAddress, &rdc_handle,
nullptr, nullptr, nullptr);
if ( result != RDC_ST_OK ) {
std::cout << "Error connecting to remote rdcd. Return: "
<< rdc_status_string(result) << std::endl;
+13 -3
Visa fil
@@ -309,20 +309,30 @@ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle);
/**
* @brief Connect to rdcd daemon
*
* @details This method is used to connect to a remote stand-alone rdcd daemon.
* This function is not thread safe.
* @details This method is used to connect to a remote stand-alone
* rdcd daemon. This function is not thread safe.
*
* @param[in] ipAndPort The IP and port of the remote rdcd. The ipAndPort
* can be specified in this x.x.x.x:yyyy format, where x.x.x.x is the
* IP address and yyyy is the port.
*
* @param [in] root_ca The root CA stored in the string in pem format. Set it
* as nullptr if the communication is not encrypted.
*
* @param [in] client_cert The client certificate stored in the string in pem
* format. Set it as nullptr if the communication is not encrypted.
*
* @param [in] client_key The client key stored in the string in pem format.
* Set it as nullptr if the communication is not encrypted.
*
* @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon
* successful call, the value will contain the handler
* for following API calls.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t* p_rdc_handle);
rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t* p_rdc_handle,
const char* root_ca, const char* client_cert, const char* client_key);
/**
* @brief Disconnect from rdcd daemon.
+2 -2
Visa fil
@@ -31,8 +31,8 @@ namespace rdc {
class RdcGroupSettings {
public:
virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_create(const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_destroy(
rdc_gpu_group_t p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_add(
+1 -1
Visa fil
@@ -34,7 +34,7 @@ namespace rdc {
class RdcGroupSettingsImpl: public RdcGroupSettings {
public:
rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
rdc_status_t rdc_group_gpu_create(
const char* group_name, rdc_gpu_group_t* p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_destroy(
rdc_gpu_group_t p_rdc_group_id) override;
+4 -2
Visa fil
@@ -79,7 +79,8 @@ class RdcStandaloneHandler: public RdcHandler {
// Control RdcAPI
rdc_status_t rdc_update_all_fields(uint32_t wait_for_update) override;
explicit RdcStandaloneHandler(const char* ip_and_port);
explicit RdcStandaloneHandler(const char* ip_and_port,
const char* root_ca, const char* client_cert, const char* client_key);
private:
// Helper function to handle the error
@@ -92,7 +93,8 @@ class RdcStandaloneHandler: public RdcHandler {
} // namespace amd
extern "C" {
amd::rdc::RdcHandler *make_handler(const char* ip_port);
amd::rdc::RdcHandler *make_handler(const char* ip_port,
const char* root_ca, const char* client_cert, const char* client_key);
}
#endif // RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_
+7 -3
Visa fil
@@ -41,8 +41,11 @@ rdc_status_t rdc_shutdown() {
}
rdc_status_t rdc_connect(const char* ipAddress,
rdc_handle_t* p_rdc_handle ) {
amd::rdc::RdcHandler* (*func_make_handler)(const char*);
rdc_handle_t* p_rdc_handle,
const char* root_ca, const char* client_cert,
const char* client_key ) {
amd::rdc::RdcHandler* (*func_make_handler)(const char*,
const char*, const char*, const char*);
if (!ipAddress || !p_rdc_handle) {
return RDC_ST_FAIL_LOAD_MODULE;
@@ -63,7 +66,8 @@ rdc_status_t rdc_connect(const char* ipAddress,
}
*p_rdc_handle = static_cast<rdc_handle_t>
(func_make_handler(ipAddress));
(func_make_handler(ipAddress,
root_ca, client_cert, client_key));
return RDC_ST_OK;
}
+50 -3
Visa fil
@@ -27,13 +27,22 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcCacheManagerImpl.h"
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcException.h"
#include "rocm_smi/rocm_smi.h"
namespace {
// call the rsmi_init when load library
// and rsmi_shutdown when unload the library.
class rsmi_initializer {
rsmi_initializer() { rsmi_init(0);}
rsmi_initializer() {
// Make sure rsmi will not be initialized multiple times
rsmi_shut_down();
rsmi_status_t rsmi_ret = rsmi_init(0);
if (rsmi_ret != RSMI_STATUS_SUCCESS) {
throw amd::rdc::RdcException(
RDC_ST_FAIL_LOAD_MODULE, "RSMI initialize fail");
}
}
~rsmi_initializer() { rsmi_shut_down();}
public:
static rsmi_initializer& getInstance() {
@@ -144,12 +153,50 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type,
if (!group_name || !p_rdc_group_id) {
return RDC_ST_BAD_PARAMETER;
}
return group_settings_->
rdc_group_gpu_create(type, group_name, p_rdc_group_id);
rdc_status_t status = group_settings_->
rdc_group_gpu_create(group_name, p_rdc_group_id);
if (status != RDC_ST_OK || type == RDC_GROUP_EMPTY) {
return status;
}
// Add All GPUs to the group
uint32_t count = 0;
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
status = rdc_get_all_devices(
gpu_index_list, &count);
if (status != RDC_ST_OK) {
return status;
}
for (uint32_t i=0; i < count; i++) {
status = rdc_group_gpu_add(*p_rdc_group_id, gpu_index_list[i]);
}
return status;
}
// Add a GPU to a group, but only after confirming that gpu_index is one of
// the devices reported by rdc_get_all_devices().
//
// Returns the status of rdc_get_all_devices() when that call fails,
// RDC_ST_NOT_FOUND when gpu_index is not in the reported list, and otherwise
// whatever the group settings object returns for the add.
rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id,
        uint32_t gpu_index) {
    uint32_t device_ids[RDC_MAX_NUM_DEVICES];
    uint32_t device_total = 0;

    const rdc_status_t query_status =
        rdc_get_all_devices(device_ids, &device_total);
    if (query_status != RDC_ST_OK) {
        return query_status;
    }

    // Walk the reported devices until the requested index is found or the
    // list is exhausted.
    uint32_t pos = 0;
    while (pos < device_total && device_ids[pos] != gpu_index) {
        ++pos;
    }
    if (pos == device_total) {
        return RDC_ST_NOT_FOUND;
    }

    return group_settings_->rdc_group_gpu_add(group_id, gpu_index);
}
+4 -6
Visa fil
@@ -29,15 +29,11 @@ namespace rdc {
RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
}
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(rdc_group_type_t type,
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(
const char* group_name, rdc_gpu_group_t* p_rdc_group_id) {
// TODO(bill_liu): handle type to create default group for all GPUs
if (type == RDC_GROUP_DEFAULT) {
return RDC_ST_NOT_SUPPORTED;
}
rdc_group_info_t ginfo;
strncpy_with_null(ginfo.group_name, group_name, RDC_MAX_STR_LENGTH);
ginfo.count = 0;
std::lock_guard<std::mutex> guard(group_mutex_);
gpu_group_.emplace(cur_group_id_, ginfo);
@@ -72,6 +68,8 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(
} else {
return RDC_ST_MAX_LIMIT;
}
} else {
return RDC_ST_NOT_FOUND;
}
return RDC_ST_OK;
+10 -2
Visa fil
@@ -34,11 +34,10 @@ namespace rdc {
bool RdcMetricFetcherImpl::is_field_valid(uint32_t field_id) const {
const std::vector<uint32_t> all_fields = {RDC_FI_GPU_MEMORY_USAGE,
RDC_FI_GPU_MEMORY_TOTAL, RDC_FI_GPU_COUNT, RDC_FI_POWER_USAGE,
RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME};
RDC_FI_GPU_SM_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_DEV_NAME, RDC_FI_GPU_TEMP};
return std::find(all_fields.begin(), all_fields.end(), field_id)
!= all_fields.end();
}
rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
@@ -113,6 +112,15 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index,
value->value.str, RDC_MAX_STR_LENGTH);
value->type = STRING;
break;
case RDC_FI_GPU_TEMP:
int64_t val_i64;
value->status = rsmi_dev_temp_metric_get(gpu_index,
0, RSMI_TEMP_CURRENT, &val_i64);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = val_i64;
}
break;
default:
break;
}
+2 -1
Visa fil
@@ -44,7 +44,8 @@ void RdcMetricsUpdaterImpl::start() {
updater_ = std::async(std::launch::async, [this](){
while (started_) {
watch_table_->rdc_update_all_fields();
std::this_thread::sleep_for(std::chrono::microseconds(100));
std::this_thread::sleep_for(
std::chrono::microseconds(_check_frequency));
}
});
}
+21 -11
Visa fil
@@ -23,18 +23,31 @@ THE SOFTWARE.
#include <grpcpp/grpcpp.h>
#include "rdc.grpc.pb.h" // NOLINT
amd::rdc::RdcHandler *make_handler(const char* ip_and_port) {
return new amd::rdc::RdcStandaloneHandler(ip_and_port);
// Factory for the standalone (remote rdcd) handler.
// Allocates a RdcStandaloneHandler with `new`, forwarding the address and the
// three PEM strings unchanged, and returns it as a RdcHandler pointer. This
// function does not release the object.
// NOTE(review): presumably the caller owns the returned pointer - confirm
// where it is deleted.
amd::rdc::RdcHandler *make_handler(const char* ip_and_port,
const char* root_ca, const char* client_cert, const char* client_key) {
return new amd::rdc::RdcStandaloneHandler(ip_and_port,
root_ca, client_cert, client_key);
}
namespace amd {
namespace rdc {
RdcStandaloneHandler::RdcStandaloneHandler(const char* ip_and_port):
stub_(::rdc::RdcAPI::NewStub(grpc::CreateChannel(ip_and_port,
grpc::InsecureChannelCredentials()))) {
}
// Create the gRPC stub used to talk to the rdcd at ip_and_port.
//
// SSL credentials are used only when root_ca, client_cert and client_key are
// all non-null; each is handed to gRPC as a PEM string. If any of the three
// is nullptr the channel is created with insecure credentials.
RdcStandaloneHandler::RdcStandaloneHandler(const char* ip_and_port,
        const char* root_ca, const char* client_cert, const char* client_key) {
    const bool have_all_pem = root_ca != nullptr
        && client_cert != nullptr
        && client_key != nullptr;

    std::shared_ptr<grpc::ChannelCredentials> channel_cred;
    if (have_all_pem) {
        grpc::SslCredentialsOptions tls_options{};
        tls_options.pem_root_certs = root_ca;
        tls_options.pem_cert_chain = client_cert;
        tls_options.pem_private_key = client_key;
        channel_cred = grpc::SslCredentials(tls_options);
    } else {
        channel_cred = grpc::InsecureChannelCredentials();
    }

    stub_ = ::rdc::RdcAPI::NewStub(
        grpc::CreateChannel(ip_and_port, channel_cred));
}
rdc_status_t RdcStandaloneHandler::error_handle(::grpc::Status status,
uint32_t rdc_status) {
@@ -44,10 +57,7 @@ rdc_status_t RdcStandaloneHandler::error_handle(::grpc::Status status,
return RDC_ST_CLIENT_ERROR;
}
if (rdc_status != RDC_ST_OK) {
return static_cast<rdc_status_t>(rdc_status);
}
return RDC_ST_OK;
return static_cast<rdc_status_t>(rdc_status);
}
// JOB RdcAPI
+4 -2
Visa fil
@@ -64,11 +64,13 @@ set(SRC_DIR "${PROJECT_SOURCE_DIR}/rdci/src")
set(INC_DIR "${PROJECT_SOURCE_DIR}/rdci/include")
set(LIB_BOOSTRAP_DIR "${PROJECT_BINARY_DIR}/rdc_libs")
include_directories(${INC_DIR} ${PROJECT_SOURCE_DIR}/include)
include_directories(${INC_DIR} ${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR})
set(RDCI_SRC_LIST "${SRC_DIR}/rdci.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDisCoverySubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDiscoverySubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciSubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${PROJECT_SOURCE_DIR}/common/rdc_utils.cc")
message("RDCI_SRC_LIST=${RDCI_SRC_LIST}")
set(RDCI_EXE "rdci")
+6
Visa fil
@@ -39,8 +39,14 @@ class RdciSubSystem {
virtual void process() = 0;
virtual ~RdciSubSystem();
protected:
void show_common_usage() const;
rdc_handle_t rdc_handle_;
std::string ip_port_;
bool use_auth_;
std::string root_ca_;
std::string client_cert_;
std::string client_key_;
};
typedef std::shared_ptr<RdciSubSystem> RdciSubSystemPtr;
+10 -11
Visa fil
@@ -23,14 +23,14 @@ THE SOFTWARE.
#include <unistd.h>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "RdcException.h"
#include "rdc_lib/RdcException.h"
#include "RdciDiscoverySubSystem.h"
namespace amd {
namespace rdc {
RdciDiscoverySubSystem::RdciDiscoverySubSystem() :show_help_(false) {
RdciDiscoverySubSystem::RdciDiscoverySubSystem() : show_help_(false) {
}
void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char ** argv) {
@@ -38,13 +38,14 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char ** argv) {
const struct option long_options[] = {
{"host", required_argument, nullptr, HOST_OPTIONS },
{"help", optional_argument, nullptr, 'h' },
{"unauth", optional_argument, nullptr, 'u' },
{ nullptr, 0 , nullptr, 0 }
};
int option_index = 0;
int opt = 0;
while ((opt = getopt_long(argc, argv, "h",
while ((opt = getopt_long(argc, argv, "hu",
long_options, &option_index)) != -1) {
switch (opt) {
case HOST_OPTIONS:
@@ -53,27 +54,25 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char ** argv) {
case 'h':
show_help_ = true;
return;
case 'u':
use_auth_ = false;
break;
default:
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER,
"Unknown command line options");
}
}
}
void RdciDiscoverySubSystem::show_help() const {
std::cout << " discovery -- Used to discover and identify GPUs "
<< "and their attributes.\n\n";
std::cout << "Usage\n";
std::cout << " rdci discovery [--host <IP/FQDN>:port]\n";
std::cout << " rdci discovery [--host <IP/FQDN>:port] [-u]\n";
std::cout << "\nFlags:\n";
std::cout << " --host <IP/FQDN>:port Connects to "
<< "specified IP or fully-qualified domain name.\n";
std::cout << " The port "
<< "must be specified.\n";
std::cout << " Default: localhost:50051\n";
std::cout << " -h --help Displays usage "
<< "information and exits.\n";
show_common_usage();
}
+76 -5
Visa fil
@@ -20,14 +20,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "RdciSubSystem.h"
#include "RdcException.h"
#include "rdc_lib/RdcException.h"
#include "common/rdc_utils.h"
namespace amd {
namespace rdc {
RdciSubSystem::RdciSubSystem():
rdc_handle_(nullptr)
, ip_port_("localhost:50051") { // default host
, ip_port_("localhost:50051") // default host
, use_auth_(true)
, root_ca_("/etc/rdc/client/certs/rdc_cacert.pem")
, client_cert_("/etc/rdc/client/certs/rdc_client_cert.pem")
, client_key_("/etc/rdc/client/private/rdc_client_cert.key") {
rdc_status_t status = rdc_init(0);
if (status != RDC_ST_OK) {
throw RdcException(status, "RDC initialize fail");
@@ -35,12 +40,79 @@ RdciSubSystem::RdciSubSystem():
}
// Establish the connection to rdcd and store the handle in rdc_handle_.
//
// When use_auth_ is set, the root CA, client certificate and client key are
// read from the files named by root_ca_, client_cert_ and client_key_, and
// their contents are passed to rdc_connect(). Otherwise rdc_connect() is
// called with nullptr for all three.
//
// Throws RdcException(RDC_ST_BAD_PARAMETER) when one of the files does not
// exist or ReadFile() returns nonzero, and RdcException(status) when
// rdc_connect() does not return RDC_ST_OK.
void RdciSubSystem::connect() {
    rdc_status_t status;

    if (use_auth_) {
        std::string ca_pem;
        std::string client_cert_pem;
        std::string client_key_pem;

        // Root CA.
        if (!FileExists(root_ca_.c_str())) {
            std::cout << "In order to use the SSL mutual authentication, the "
                << "root CA must be copied to " << root_ca_ << std::endl;
            throw RdcException(RDC_ST_BAD_PARAMETER, "root CA not found");
        }
        int ret = ReadFile(root_ca_, &ca_pem);
        if (ret) {
            // Trailing space keeps the path separate from the message text.
            throw RdcException(RDC_ST_BAD_PARAMETER,
                std::string("Fail to read root CA at ") + root_ca_);
        }

        // Client certificate.
        if (!FileExists(client_cert_.c_str())) {
            std::cout << "In order to use the SSL mutual authentication, the "
                << "client certificate must be copied to "
                << client_cert_ << std::endl;
            throw RdcException(RDC_ST_BAD_PARAMETER,
                "client cert not found");
        }
        ret = ReadFile(client_cert_, &client_cert_pem);
        if (ret) {
            throw RdcException(RDC_ST_BAD_PARAMETER,
                std::string("Fail to read client certificate at ")
                    + client_cert_);
        }

        // Client private key.
        if (!FileExists(client_key_.c_str())) {
            std::cout << "In order to use the SSL mutual authentication, the "
                << "client private key must be copied to "
                << client_key_ << std::endl;
            throw RdcException(RDC_ST_BAD_PARAMETER,
                "client key not found");
        }
        ret = ReadFile(client_key_, &client_key_pem);
        if (ret) {
            throw RdcException(RDC_ST_BAD_PARAMETER,
                std::string("Fail to read client key at ") + client_key_);
        }

        status = rdc_connect(ip_port_.c_str(), &rdc_handle_,
            ca_pem.c_str(), client_cert_pem.c_str(), client_key_pem.c_str());
    } else {  // Not use the SSL mutual authentication
        status = rdc_connect(ip_port_.c_str(), &rdc_handle_,
            nullptr, nullptr, nullptr);
    }

    if (status != RDC_ST_OK) {
        throw RdcException(status,
            "Fail to setup the connection. Please check all libraries in right folder");
    }
}
// Print the help text for the options shared by the rdci subsystems
// (--host, -u/--unauth and -h/--help). The -u entry includes the paths
// currently held in root_ca_, client_cert_ and client_key_.
void RdciSubSystem::show_common_usage() const {
    // Remote endpoint selection.
    std::cout << " --host <IP/FQDN>:port Connects to specified IP or "
                 "fully-qualified domain name.\n";
    std::cout << " The port must be specified.\n";
    std::cout << " Default: localhost:50051\n";

    // Authentication switch and the files expected when it is left on.
    std::cout << " -u --unauth Do not use the SSL mutual authentication to"
                 " encrypt the communication\n";
    std::cout << " Default: SSL mutual will be used. You must copy the"
                 " root CA to ";
    std::cout << root_ca_ << "\n";
    std::cout << " Client certificate to " << client_cert_ << "\n";
    std::cout << " Client key to " << client_key_ << "\n";

    // Help.
    std::cout << " -h --help Displays usage information and exits.\n";
}
RdciSubSystem::~RdciSubSystem() {
if (rdc_handle_) {
@@ -51,6 +123,5 @@ RdciSubSystem::~RdciSubSystem() {
rdc_shutdown();
}
} // namespace rdc
} // namespace amd
+1 -1
Visa fil
@@ -24,7 +24,7 @@ THE SOFTWARE.
#include <string>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "RdcException.h"
#include "rdc_lib/RdcException.h"
#include "RdciDiscoverySubSystem.h"