Break srvs. into rsmi & admin srvs. Add VerifyConnection api.

Change-Id: I67567264c37e31f3409062a14e56eba4801cd944


[ROCm/rdc commit: dc6f6f3e9a]
This commit is contained in:
Chris Freehill
2019-12-22 20:30:58 -06:00
parent bc7f01e992
commit ba14edbb4d
19 ha cambiato i file con 316 aggiunte e 87 eliminazioni
+2 -2
Vedi File
@@ -25,8 +25,8 @@ cmake_minimum_required(VERSION 3.5.0)
# ROCM_DIR should be passed in via command line; these will be used
# in sub-projects
set(RSMI_INC_DIR ${ROCM_DIR}/include)
set(RSMI_LIB_DIR ${ROCM_DIR}/lib)
set(RSMI_INC_DIR ${ROCM_DIR}/rocm_smi/include)
set(RSMI_LIB_DIR ${ROCM_DIR}/rocm_smi/lib)
## Set default module path
if(NOT DEFINED CMAKE_MODULE_PATH)
+14 -11
Vedi File
@@ -67,7 +67,7 @@ set(CLIENT_LIB "rdc_client")
set(RDC "rdc")
set(CLIENT_LIB_COMPONENT "lib${CLIENT_LIB}")
set(SRC_DIR "${PROJECT_SOURCE_DIR}/client/src")
set(INC_DIR "${PROJECT_SOURCE_DIR}/client/include/rdc")
set(RDC_CLIENT_INC_DIR "${PROJECT_SOURCE_DIR}/client/include/rdc")
################# Determine the library version #########################
## Setup the SO version based on git tags.
@@ -119,27 +119,30 @@ set(CMAKE_VERBOSE_MAKEFILE on)
file(GLOB PROTOBUF_GENERATED_INCLUDES "${PROTOB_OUT_DIR}/*.h")
file(GLOB PROTOBUF_GENERATED_SRCS "${PROTOB_OUT_DIR}/*.cc")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${PROJECT_SOURCE_DIR}"
"${PROTOB_OUT_DIR}" "${RSMI_INC_DIR}")
set(CLIENT_LIB_SRC_LIST "${SRC_DIR}/rdc_client.cc")
set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST} "${SRC_DIR}/rdc_main.cc")
set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST} "${SRC_DIR}/rdc_client_main.cc")
set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}")
set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST}
"${PROJECT_SOURCE_DIR}/common/rdc_utils.cc")
message("CLIENT_LIB_SRC_LIST=${CLIENT_LIB_SRC_LIST}")
set(CLIENT_LIB_INC_LIST "${INC_DIR}/rdc_client.h")
set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST} "${INC_DIR}/rdc_exception.h")
set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST} "${INC_DIR}/rdc_main.h")
set(CLIENT_LIB_INC_LIST "${RDC_CLIENT_INC_DIR}/rdc_client.h")
set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST}
"${PROJECT_SOURCE_DIR}/common/rdc_utils.h")
"${RDC_CLIENT_INC_DIR}/rdc_exception.h")
set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST}
"${RDC_CLIENT_INC_DIR}/rdc_client_main.h")
set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST}
"${PROJECT_SOURCE_DIR}/common/rdc_utils.h")
add_library(${CLIENT_LIB} SHARED ${CLIENT_LIB_SRC_LIST} ${CLIENT_LIB_INC_LIST})
target_link_libraries(${CLIENT_LIB} pthread rt grpc grpc++ grpc++_reflection
dl protobuf)
target_include_directories(${CLIENT_LIB} PUBLIC ${INC_DIR})
target_include_directories(${CLIENT_LIB} PRIVATE
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/include"
"${PROTOB_OUT_DIR}"
"${RSMI_INC_DIR}")
# TODO: set the properties for the library once we have one
## Set the VERSION and SOVERSION values
set_property(TARGET ${CLIENT_LIB} PROPERTY
@@ -24,6 +24,8 @@ THE SOFTWARE.
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
#include <grpcpp/grpcpp.h>
#include <memory>
#include <string>
#include "rocm_smi/rocm_smi.h"
@@ -192,6 +194,64 @@ typedef uintptr_t rdc_channel_t;
#define RDC_DEFAULT_SERVER_PORT 50051
#define RDC_DEFAULT_SERVER_IP "localhost"
/*****************************************************************************/
/** @defgroup RDCAdmin RDC Administration Functions
* These administrative functions are used to monitor and control, for
* example RDC connectivity.
* @{
*/
/**
* @brief Check the connection status of a channel
*
* @details Given an ::rdc_channel_t @p channel and a boolean @p
* try_to_connect, this function writes the current grpc_connectivity_state
* of that channel to @p state.
*
* @param[in] channel The channel for which the status will be given
*
* @param[in] try_to_connect If true and the channel is currently IDLE, the
* channel will transition to CONNECTING.
*
* @param[out] state A pointer to caller provided memory to which
* the grpc_connectivity_state will be written. grpc_connectivity_state has
* the following possible values:
* GRPC_CHANNEL_IDLE channel is idle
* GRPC_CHANNEL_CONNECTING channel is connecting
* GRPC_CHANNEL_READY channel is ready for work
* GRPC_CHANNEL_TRANSIENT_FAILURE channel has seen a failure but expects to
* recover
* GRPC_CHANNEL_SHUTDOWN channel has seen a failure that it cannot
* recover from
*
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t
rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
grpc_connectivity_state *state);
/**
* @brief Verify a channel's connection to the server
*
* @details Given an ::rdc_channel_t @p channel, this function will send a
* pseudo-random number to the server associated with @p channel. The server
* will send the number back. Upon receiving the returned message from the
* server, the number sent to the server is compared to the number received
* from the server. If the 2 numbers are the same, the connection is verified.
* Otherwise, an appropriate error code is returned.
*
* @param[in] channel The channel for which the connection will be verified
*
* @retval ::RDC_STATUS_SUCCESS is returned if the server echoed the number
* that was sent.
*
* @retval ::RDC_STATUS_GRPC_DATA_LOSS is returned if the number echoed by the
* server differs from the number that was sent.
*
* If the remote call itself fails, the gRPC error code translated to an
* ::rdc_status_t value is returned.
*
*/
rdc_status_t
rdc_channel_connection_verify(rdc_channel_t channel);
/** @} */ // end of RDCAdmin
/*****************************************************************************/
/** @defgroup InitShutAdmin Initialization and Shutdown
* These functions are used for initialization of RDC and clean up when
@@ -216,7 +276,7 @@ typedef uintptr_t rdc_channel_t;
*
* @param[in] port A pointer to string containing the port on which the
* RDC server is listening
*
*
* @param[in] secure A bool indicating whether SSL should be used for
* communications (not currently supported)
*
@@ -21,8 +21,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CLIENT_INCLUDE_RDC_RDC_MAIN_H_
#define CLIENT_INCLUDE_RDC_RDC_MAIN_H_
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
#include <string>
#include <memory>
@@ -48,15 +48,21 @@ class RDCChannel {
std::string server_ip(void) const {return server_ip_;}
std::string server_port(void) const {return server_port_;}
bool secure_channel(void) const {return secure_channel_;}
std::shared_ptr<::rdc::Rsmi::Stub> stub(void) const {return stub_;}
std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const {return rsmi_stub_;}
std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const {
return rdc_admin_stub_;}
std::shared_ptr<grpc::Channel> const channel(void) {return channel_;}
private:
std::string server_ip_;
std::string server_port_;
bool secure_channel_;
std::shared_ptr<::rdc::Rsmi::Stub> stub_;
std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub_;
std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub_;
std::shared_ptr<grpc::Channel> channel_;
};
} // namespace rdc
} // namespace amd
#endif // CLIENT_INCLUDE_RDC_RDC_MAIN_H_
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
+46 -4
Vedi File
@@ -1,5 +1,5 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019 - Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -22,10 +22,11 @@ THE SOFTWARE.
#include <grpcpp/grpcpp.h>
#include <time.h>
#include <unistd.h>
#include <iostream>
#include "rdc/rdc_main.h"
#include "rdc/rdc_client_main.h"
#include "rdc/rdc_client.h"
#include "common/rdc_utils.h"
#include "rdc/rdc_exception.h"
@@ -113,6 +114,45 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip,
CATCH
}
// Report the gRPC connectivity state of the channel behind |channel|.
// The result is written to |*state|; RDC_STATUS_SUCCESS is returned once the
// state has been stored.
rdc_status_t
rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
                      grpc_connectivity_state *state) {
  TRY
  CHK_PTR_ARG(state)
  // NOTE(review): |ch| used below is presumably declared by this macro;
  // confirm against the macro definition.
  UINTPTR_TO_RDC_CHAN(channel)

  // Ask gRPC for the state first, then hand it to the caller.
  const grpc_connectivity_state current_state =
      ch->channel()->GetState(try_to_connect);
  *state = current_state;

  return RDC_STATUS_SUCCESS;
  CATCH
}
// Round-trip a time-seeded pseudo-random number through the RdcAdmin
// VerifyConnection RPC and check that the server echoes the same value.
// Returns RDC_STATUS_SUCCESS on a matching echo, RDC_STATUS_GRPC_DATA_LOSS on
// a mismatch, or the translated gRPC error if the call itself fails.
rdc_status_t
rdc_channel_connection_verify(rdc_channel_t channel) {
  TRY
  // NOTE(review): |ch| used below is presumably declared by this macro;
  // confirm against the macro definition.
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::VerifyConnectionResponse response;
  ::rdc::VerifyConnectionRequest request;
  ::grpc::ClientContext ctx;

  // Choose the value the server is expected to send back.
  unsigned int rand_seed = time(NULL);
  const uint64_t sent_magic = static_cast<uint64_t>(rand_r(&rand_seed));
  request.set_magic_num(sent_magic);

  const ::grpc::Status rpc_status =
      ch->rdc_admin_stub()->VerifyConnection(&ctx, request, &response);
  if (!rpc_status.ok()) {
    return amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  // The connection is only considered verified on an exact echo.
  if (response.echo_magic_num() == sent_magic) {
    return RDC_STATUS_SUCCESS;
  }
  return RDC_STATUS_GRPC_DATA_LOSS;
  CATCH
}
rdc_status_t
rdc_channel_destroy(rdc_channel_t channel) {
@@ -135,7 +175,8 @@ rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu) {
::rdc::GetNumDevicesResponse resp;
::rdc::GetNumDevicesRequest empty;
::grpc::ClientContext context;
::grpc::Status status = ch->stub()->GetNumDevices(&context, empty, &resp);
::grpc::Status status =
ch->rsmi_stub()->GetNumDevices(&context, empty, &resp);
if (!status.ok()) {
return amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -171,7 +212,8 @@ rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind,
in_args.set_dv_ind(dv_ind);
in_args.set_sensor_type(sensor_type);
::grpc::Status status = ch->stub()->GetTemperature(&context, in_args, &resp);
::grpc::Status status =
ch->rsmi_stub()->GetTemperature(&context, in_args, &resp);
if (!status.ok()) {
return ::amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -27,7 +27,7 @@ THE SOFTWARE.
#include <string>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_main.h"
#include "rdc/rdc_client_main.h"
#include "rdc/rdc_client.h"
namespace amd {
@@ -48,21 +48,25 @@ RDCChannel::Initialize(void) {
std::string addr_str = server_ip() + ":";
addr_str += server_port();
std::shared_ptr<grpc::Channel> channel;
if (secure_channel_) {
// Not yet supported
return RDC_STATUS_GRPC_UNIMPLEMENTED;
} else {
channel = ::grpc::CreateChannel(addr_str,
channel_ = ::grpc::CreateChannel(addr_str,
grpc::InsecureChannelCredentials());
}
stub_ = ::rdc::Rsmi::NewStub(channel);
if (stub_ == nullptr) {
rsmi_stub_ = ::rdc::Rsmi::NewStub(channel_);
if (rsmi_stub_ == nullptr) {
return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED;
}
rdc_admin_stub_ = ::rdc::RdcAdmin::NewStub(channel_);
if (rdc_admin_stub_ == nullptr) {
return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED;
}
// Test to see if we can connect to server; if not, return err.
return RDC_STATUS_SUCCESS;
}
+1 -1
Vedi File
@@ -34,7 +34,7 @@ rdc_status_t GrpcErrorToRdcError(grpc::StatusCode grpc_err) {
uint32_t rdc_grpc_base_int =
static_cast<uint32_t>(RDC_STATUS_GRPC_ERR_FIRST);
uint32_t rdc_err_int = grpc_err_int + rdc_grpc_base_int;
return static_cast<rdc_status_t>(rdc_err_int);
}
File binario non mostrato.
+26 -18
Vedi File
@@ -27,7 +27,18 @@ syntax = "proto3";
// option objc_class_prefix = "HLW";
package rdc;
/****************************************************************************/
/********************************** Rsmi Service ****************************/
/****************************************************************************/
// Service exposing the RSMI device queries listed below.
service Rsmi {
  // RSMI ID services
  rpc GetNumDevices (GetNumDevicesRequest)
                                        returns (GetNumDevicesResponse) {}

  // RSMI Physical Queries
  rpc GetTemperature (GetTemperatureRequest)
                                        returns (GetTemperatureResponse) {}
}
// rsmi_num_monitor_devices()
// Request message for Rsmi.GetNumDevices; it carries no fields.
message GetNumDevicesRequest {
}
@@ -36,14 +47,6 @@ message GetNumDevicesResponse {
uint64 ret_val = 2;
}
/* GetNumDevices */
message VerifyConnectionRequest {
string name = 1;
}
message VerifyConnectionResponse {
string message = 1;
}
/* GetTemperature */
message GetTemperatureRequest {
uint32 dv_ind = 1;
@@ -71,15 +74,20 @@ message GetTemperatureResponse {
uint64 ret_val = 2;
}
// The greeting service definition.
service Rsmi {
/****************************************************************************/
/********************************** RdcAdmin Service ************************/
/****************************************************************************/
service RdcAdmin {
// RDC admin services
rpc VerifyConnection (VerifyConnectionRequest) returns (VerifyConnectionResponse) {}
// RSMI ID services
rpc GetNumDevices (GetNumDevicesRequest) returns(GetNumDevicesResponse) {}
// RSMI Physical Queries
rpc GetTemperature(GetTemperatureRequest) returns(GetTemperatureResponse) {}
rpc VerifyConnection (VerifyConnectionRequest)
returns (VerifyConnectionResponse) {}
}
/* VerifyConnection */
// Request message for RdcAdmin.VerifyConnection.
message VerifyConnectionRequest {
// Number chosen by the client; the server is expected to echo it back.
uint64 magic_num = 1;
}
// Response message for RdcAdmin.VerifyConnection.
message VerifyConnectionResponse {
// The magic_num value from the request, echoed back by the server.
uint64 echo_magic_num = 1;
}
+2 -1
Vedi File
@@ -71,7 +71,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include
"${PROTOB_OUT_DIR}" "${RSMI_INC_DIR}")
set(SERVER_SRC_LIST "${SRC_DIR}/rdc_rsmi_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_main.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_admin_service.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_main.cc")
set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}")
message("SERVER_SRC_LIST=${SERVER_SRC_LIST}")
+39
Vedi File
@@ -0,0 +1,39 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_
#define SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_
#include "rdc.grpc.pb.h" // NOLINT
#include "rocm_smi/rocm_smi.h"
#include "rdc/rdc_admin_service.h"
// Synchronous implementation of the ::rdc::RdcAdmin gRPC service.
class RDCAdminServiceImpl final : public ::rdc::RdcAdmin::Service {
 public:
  RDCAdminServiceImpl();
  ~RDCAdminServiceImpl();

  // Handler for the RdcAdmin VerifyConnection RPC.
  ::grpc::Status
  VerifyConnection(::grpc::ServerContext* context,
                   const rdc::VerifyConnectionRequest* request,
                   rdc::VerifyConnectionResponse* reply) override;
};
#endif // SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_
@@ -33,10 +33,6 @@ class RsmiServiceImpl final : public ::rdc::Rsmi::Service {
rsmi_status_t Initialize(uint64_t rsmi_init_flags = 0);
::grpc::Status VerifyConnection(::grpc::ServerContext* context,
const rdc::VerifyConnectionRequest* request,
rdc::VerifyConnectionResponse* reply) override;
::grpc::Status
GetNumDevices(::grpc::ServerContext* context,
const ::rdc::GetNumDevicesRequest* request,
@@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SERVER_INCLUDE_RDC_RDC_MAIN_H_
#define SERVER_INCLUDE_RDC_RDC_MAIN_H_
#ifndef SERVER_INCLUDE_RDC_RDC_SERVER_MAIN_H_
#define SERVER_INCLUDE_RDC_RDC_SERVER_MAIN_H_
#include <grpcpp/grpcpp.h>
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include <memory>
#include "rdc/rdc_rsmi_service.h"
#include "rdc/rdc_admin_service.h"
class RDCServer {
public:
@@ -41,15 +42,22 @@ class RDCServer {
bool start_rsmi_service(void) const {return start_rsmi_service_;}
void set_start_rsmi_service(bool s) {start_rsmi_service_ = s;}
bool start_rdc_admin_service(void) const {return start_rdc_admin_service_;}
void set_start_rdc_admin_service(bool s) {start_rdc_admin_service_ = s;}
void ShutDown(void);
private:
void HandleSignal(int sig);
std::string server_address_;
bool start_rsmi_service_;
std::unique_ptr<::grpc::Server> server_;
bool start_rsmi_service_;
RsmiServiceImpl *rsmi_service_;
bool start_rdc_admin_service_;
RDCAdminServiceImpl *rdc_admin_service_;
};
#endif // SERVER_INCLUDE_RDC_RDC_MAIN_H_
#endif // SERVER_INCLUDE_RDC_RDC_SERVER_MAIN_H_
+1 -1
Vedi File
@@ -7,7 +7,7 @@ Description=Radeon Data Center Daemon (rdcd)
After=network.target
# Add any services that must be started before rdcd here
#After=
#After=
# Add any non-service units required by rdcd here
#Requires=
+49
Vedi File
@@ -0,0 +1,49 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <assert.h>
#include <grpcpp/grpcpp.h>
#include <unistd.h>
#include <iostream>
#include <sstream>
#include <memory>
#include <string>
#include <csignal>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_admin_service.h"
// The admin service has nothing to set up at construction time.
RDCAdminServiceImpl::RDCAdminServiceImpl() = default;
// The admin service has nothing of its own to release on destruction.
RDCAdminServiceImpl::~RDCAdminServiceImpl() = default;
// VerifyConnection RPC handler: copies the magic number from the request into
// the reply unchanged and reports OK.
::grpc::Status
RDCAdminServiceImpl::VerifyConnection(
                              ::grpc::ServerContext* context,
                              const rdc::VerifyConnectionRequest* request,
                              rdc::VerifyConnectionResponse* reply) {
  static_cast<void>(context);  // Quiet warning for now

  const auto received_magic = request->magic_num();
  reply->set_echo_magic_num(received_magic);

  return ::grpc::Status::OK;
}
@@ -61,15 +61,6 @@ RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) {
}
return rsmi_ret;
}
::grpc::Status
RsmiServiceImpl::VerifyConnection(::grpc::ServerContext* context,
const rdc::VerifyConnectionRequest* request,
rdc::VerifyConnectionResponse* reply) {
(void)context; // Quiet warning for now
std::string prefix("Hello ");
reply->set_message(prefix + request->name());
return ::grpc::Status::OK;
}
::grpc::Status
RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context,
@@ -36,7 +36,7 @@ THE SOFTWARE.
#include "rdc.grpc.pb.h" // NOLINT
#include "rocm_smi/rocm_smi.h"
#include "rdc/rdc_main.h"
#include "rdc/rdc_server_main.h"
#include "rdc/rdc_rsmi_service.h"
static bool sShutDownServer = false;
@@ -46,7 +46,7 @@ static const char *kRDCDHomeDir = "/";
static const char *kDaemonLockFile = "/var/run/rdcd.lock";
RDCServer::RDCServer() : server_address_("0.0.0.0:50051"),
rsmi_service_(nullptr) {
rsmi_service_(nullptr), rdc_admin_service_(nullptr) {
}
RDCServer::~RDCServer() {
@@ -66,6 +66,10 @@ RDCServer::Run() {
// Register services as the instances through which we'll communicate with
// clients. These are synchronous services.
if (start_rdc_admin_service()) {
rdc_admin_service_ = new RDCAdminServiceImpl();
builder.RegisterService(rdc_admin_service_);
}
if (start_rsmi_service()) {
rsmi_service_ = new RsmiServiceImpl();
@@ -125,6 +129,11 @@ RDCServer::ShutDown(void) {
delete rsmi_service_;
rsmi_service_ = nullptr;
}
if (rdc_admin_service_) {
delete rdc_admin_service_;
rdc_admin_service_ = nullptr;
}
}
static void * ProcessSignalLoop(void *server_ptr) {
@@ -268,6 +277,8 @@ int main(int argc, char** argv) {
// TODO(cfreehil): Eventually, set these by reading a config file
rdc_server.set_start_rsmi_service(true);
rdc_server.set_start_rdc_admin_service(true);
// rdc_server.set_secure_communications(false);
// rdc_server.set_address("0.0.0.0:50051")
+5 -6
Vedi File
@@ -41,9 +41,6 @@ endif ()
# Required Defines first:
set(RSMI_INC_DIR ${ROCM_DIR}/include)
set(RSMI_LIB_DIR ${ROCM_DIR}/lib)
message("")
message("Build Configuration:")
message("-----------BuildType: " ${CMAKE_BUILD_TYPE})
@@ -53,7 +50,6 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
message("")
@@ -66,8 +62,6 @@ include(utils)
## Verbose output.
set(CMAKE_VERBOSE_MAKEFILE on)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../client/include"
"${PROTOB_OUT_DIR}" "${RSMI_INC_DIR}")
set(EXAMPLE_SRC_LIST "${SRC_DIR}/rdc_client_test.cc")
message("EXAMPLE_SRC_LIST=${EXAMPLE_SRC_LIST}")
@@ -77,6 +71,11 @@ set(CLIENT_LIB_INC_LIST "${INC_DIR}/rdc_client.h")
set(TEST_CLIENT_EXE "rdc_test_client")
add_executable(${TEST_CLIENT_EXE} "${EXAMPLE_SRC_LIST}")
target_include_directories(${TEST_CLIENT_EXE} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/../../client/include"
"${RSMI_INC_DIR}")
target_link_libraries(${TEST_CLIENT_EXE} rdc_client)
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
@@ -31,7 +31,10 @@ THE SOFTWARE.
#define CHK_RET_STATUS(RET) \
if ((RET) != RDC_STATUS_SUCCESS) { \
std::cout << "rdc call returned error: " << (RET) << std::endl; \
const char *err_msg_str; \
(void)rdc_status_string((RET), &err_msg_str); \
std::cout << "rdc call returned error: " << (RET) << ":\"" << \
err_msg_str << "\"" << std::endl; \
}
#define CHK_RET_STATUS_CONT(RET) \
@@ -41,11 +44,8 @@ THE SOFTWARE.
}
int main(int argc, char** argv) {
(void)argc; // ignore for now
(void)argv; // ignore for now
rdc_status_t ret;
rdc_channel_t server;
rdc_channel_t server_ch;
uint64_t num_gpu;
int64_t temperature;
std::string serv_host("localhost");
@@ -61,27 +61,39 @@ int main(int argc, char** argv) {
std::cout << "Attempting to create channel to " << serv_host << ":" <<
serv_port << std::endl;
ret = rdc_channel_create(&server, serv_host.c_str(), serv_port.c_str(),
ret = rdc_channel_create(&server_ch, serv_host.c_str(), serv_port.c_str(),
false);
CHK_RET_STATUS(ret)
std::cout << "Successfully created channel" << std::endl;
std::cout << "Getting number of gpus at server..." << std::endl;
ret = rdc_num_gpus_get(server, &num_gpu);
grpc_connectivity_state ch_state;
ret = rdc_channel_state_get(server_ch, true, &ch_state);
CHK_RET_STATUS(ret)
std::cout << "Number of GPUs at server is " << num_gpu << std::endl;
std::cout << "Current channel state is " << ch_state << std::endl;
std::cout << "Verifying connection to server..." << std::endl;
ret = rdc_channel_connection_verify(server_ch);
CHK_RET_STATUS(ret)
if (ret == RDC_STATUS_SUCCESS) {
std::cout << "Verified connection to server." << std::endl;
}
std::cout << "Getting number of gpus at server..." << std::endl;
ret = rdc_num_gpus_get(server_ch, &num_gpu);
CHK_RET_STATUS(ret)
// Print only the GPU count. The channel handle used to be streamed into this
// message as well, which put the handle's digits immediately in front of the
// count.
std::cout << "Number of GPUs at server is " << num_gpu << std::endl;
for (uint32_t dv_ind = 0; dv_ind < num_gpu; ++dv_ind) {
std::cout << "Info for Device " << dv_ind << ":" << std::endl;
std::cout << "\tGetting temperature..." << std::endl;
ret = rdc_dev_temp_metric_get(server, dv_ind, RSMI_TEMP_TYPE_JUNCTION,
ret = rdc_dev_temp_metric_get(server_ch, dv_ind, RSMI_TEMP_TYPE_JUNCTION,
RSMI_TEMP_CURRENT, &temperature);
CHK_RET_STATUS_CONT(ret)
std::cout << "\t GPU " << dv_ind << " has a temperature of " <<
temperature << std::endl;
}
ret = rdc_channel_destroy(server);
ret = rdc_channel_destroy(server_ch);
CHK_RET_STATUS(ret)
std::cout << "Successfully destroyed channel to " << serv_host << ":" <<
serv_port << std::endl;