diff --git a/projects/rdc/CMakeLists.txt b/projects/rdc/CMakeLists.txt index 362f34e00b..b496826896 100755 --- a/projects/rdc/CMakeLists.txt +++ b/projects/rdc/CMakeLists.txt @@ -25,8 +25,8 @@ cmake_minimum_required(VERSION 3.5.0) # ROCM_DIR should be passed in via command line; these will be used # in sub-projects -set(RSMI_INC_DIR ${ROCM_DIR}/include) -set(RSMI_LIB_DIR ${ROCM_DIR}/lib) +set(RSMI_INC_DIR ${ROCM_DIR}/rocm_smi/include) +set(RSMI_LIB_DIR ${ROCM_DIR}/rocm_smi/lib) ## Set default module path if(NOT DEFINED CMAKE_MODULE_PATH) diff --git a/projects/rdc/client/CMakeLists.txt b/projects/rdc/client/CMakeLists.txt index 41f855f2d2..44db3b46ca 100755 --- a/projects/rdc/client/CMakeLists.txt +++ b/projects/rdc/client/CMakeLists.txt @@ -67,7 +67,7 @@ set(CLIENT_LIB "rdc_client") set(RDC "rdc") set(CLIENT_LIB_COMPONENT "lib${CLIENT_LIB}") set(SRC_DIR "${PROJECT_SOURCE_DIR}/client/src") -set(INC_DIR "${PROJECT_SOURCE_DIR}/client/include/rdc") +set(RDC_CLIENT_INC_DIR "${PROJECT_SOURCE_DIR}/client/include/rdc") ################# Determine the library version ######################### ## Setup the SO version based on git tags. @@ -119,27 +119,30 @@ set(CMAKE_VERBOSE_MAKEFILE on) file(GLOB PROTOBUF_GENERATED_INCLUDES "${PROTOB_OUT_DIR}/*.h") file(GLOB PROTOBUF_GENERATED_SRCS "${PROTOB_OUT_DIR}/*.cc") -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" - "${PROJECT_SOURCE_DIR}" - "${PROTOB_OUT_DIR}" "${RSMI_INC_DIR}") set(CLIENT_LIB_SRC_LIST "${SRC_DIR}/rdc_client.cc") -set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST} "${SRC_DIR}/rdc_main.cc") +set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST} "${SRC_DIR}/rdc_client_main.cc") set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}") set(CLIENT_LIB_SRC_LIST ${CLIENT_LIB_SRC_LIST} "${PROJECT_SOURCE_DIR}/common/rdc_utils.cc") message("CLIENT_LIB_SRC_LIST=${CLIENT_LIB_SRC_LIST}") -set(CLIENT_LIB_INC_LIST "${INC_DIR}/rdc_client.h") -set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST} "${INC_DIR}/rdc_exception.h") -set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST} "${INC_DIR}/rdc_main.h") +set(CLIENT_LIB_INC_LIST "${RDC_CLIENT_INC_DIR}/rdc_client.h") set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST} - "${PROJECT_SOURCE_DIR}/common/rdc_utils.h") + "${RDC_CLIENT_INC_DIR}/rdc_exception.h") +set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST} + "${RDC_CLIENT_INC_DIR}/rdc_client_main.h") +set(CLIENT_LIB_INC_LIST ${CLIENT_LIB_INC_LIST} + "${PROJECT_SOURCE_DIR}/common/rdc_utils.h") add_library(${CLIENT_LIB} SHARED ${CLIENT_LIB_SRC_LIST} ${CLIENT_LIB_INC_LIST}) target_link_libraries(${CLIENT_LIB} pthread rt grpc grpc++ grpc++_reflection dl protobuf) -target_include_directories(${CLIENT_LIB} PUBLIC ${INC_DIR}) - +target_include_directories(${CLIENT_LIB} PRIVATE + "${PROJECT_SOURCE_DIR}" + "${PROJECT_SOURCE_DIR}/include" + "${CMAKE_CURRENT_SOURCE_DIR}/include" + "${PROTOB_OUT_DIR}" + "${RSMI_INC_DIR}") # TODO: set the properties for the library once we have one ## Set the VERSION and SOVERSION values set_property(TARGET ${CLIENT_LIB} PROPERTY diff --git a/projects/rdc/client/include/rdc/rdc_client.h b/projects/rdc/client/include/rdc/rdc_client.h index d05f73d99a..981c249a29 100755 --- a/projects/rdc/client/include/rdc/rdc_client.h +++ b/projects/rdc/client/include/rdc/rdc_client.h @@ -24,6 +24,8 @@ THE SOFTWARE. #ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_H_ #define CLIENT_INCLUDE_RDC_RDC_CLIENT_H_ +#include + #include #include #include "rocm_smi/rocm_smi.h" @@ -192,6 +194,64 @@ typedef uintptr_t rdc_channel_t; #define RDC_DEFAULT_SERVER_PORT 50051 #define RDC_DEFAULT_SERVER_IP "localhost" +/*****************************************************************************/ +/** @defgroup RDCAdmin RDC Administration Functions + * These administrative functions are used to monitor and control, for + * example RDC connectivity. + * @{ + */ + +/** + * @brief Check the connection status of a channel + * + * @details Given an ::rdc_channel_t @p channel and a boolean @p + * try_to_connect, this function will return the grpc_connectivity_state for + * that channel + * + * @p channel[in] The channel for which the status will be given + * + * @param[in] try_to_connect If the channel is currently IDLE, if the argument + * is true, transition to CONNECTING. + * + * @param[inout] state A pointer to caller provided memory to which an + * the grpc_connectivity_state will be written. grpc_connectivity_state has + * the following possible values: + * GRPC_CHANNEL_IDLE channel is idle + * GRPC_CHANNEL_CONNECTING channel is connecting + * GRPC_CHANNEL_READY channel is ready for work + * GRPC_CHANNEL_TRANSIENT_FAILURE channel has seen a failure but expects to + * recover + * GRPC_CHANNEL_SHUTDOWN channel has seen a failure that it cannot + * recover from + * + * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. + * + */ +rdc_status_t +rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, + grpc_connectivity_state *state); + + +/** + * @brief Verify a channel's connection to the server + * + * @details Given an ::rdc_channel_t @p channel, this function will send a + * random number to the server associated with @p channel. The server will send + * the number back. Upon receiving the returned message from the server, the + * number sent to the server is compared to the number received from the + * server. If the 2 numbers are the same, the connection is verified. + * Otherwise, an appropriate error code is returned. + * + * @p channel[in] The channel for which the connection will be verified + * + * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. + * + */ +rdc_status_t +rdc_channel_connection_verify(rdc_channel_t channel); + +/** @} */ // end of RDCAdmin + /*****************************************************************************/ /** @defgroup InitShutAdmin Initialization and Shutdown * These functions are used for initialization of RDC and clean up when @@ -216,7 +276,7 @@ typedef uintptr_t rdc_channel_t; * * @param[in] port A pointer to string containing the port on which the * RDC server is listening - * + * * @param[in] secure A bool indicating whether SSL should be used for * communications (not currently supported) * diff --git a/projects/rdc/client/include/rdc/rdc_main.h b/projects/rdc/client/include/rdc/rdc_client_main.h similarity index 76% rename from projects/rdc/client/include/rdc/rdc_main.h rename to projects/rdc/client/include/rdc/rdc_client_main.h index 8c2fa88b3b..1b7860b46c 100755 --- a/projects/rdc/client/include/rdc/rdc_main.h +++ b/projects/rdc/client/include/rdc/rdc_client_main.h @@ -21,8 +21,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef CLIENT_INCLUDE_RDC_RDC_MAIN_H_ -#define CLIENT_INCLUDE_RDC_RDC_MAIN_H_ +#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_ +#define CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_ #include #include @@ -48,15 +48,21 @@ class RDCChannel { std::string server_ip(void) const {return server_ip_;} std::string server_port(void) const {return server_port_;} bool secure_channel(void) const {return secure_channel_;} - std::shared_ptr<::rdc::Rsmi::Stub> stub(void) const {return stub_;} + std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const {return rsmi_stub_;} + std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const { + return rdc_admin_stub_;} + std::shared_ptr const channel(void) {return channel_;} + private: std::string server_ip_; std::string server_port_; bool secure_channel_; - std::shared_ptr<::rdc::Rsmi::Stub> stub_; + std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub_; + std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub_; + std::shared_ptr channel_; }; } // namespace rdc } // namespace amd -#endif // CLIENT_INCLUDE_RDC_RDC_MAIN_H_ +#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_ diff --git a/projects/rdc/client/src/rdc_client.cc b/projects/rdc/client/src/rdc_client.cc index 643534922c..b3ab41efd2 100755 --- a/projects/rdc/client/src/rdc_client.cc +++ b/projects/rdc/client/src/rdc_client.cc @@ -1,5 +1,5 @@ /* -Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2019 - Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -22,10 +22,11 @@ THE SOFTWARE. #include +#include #include #include -#include "rdc/rdc_main.h" +#include "rdc/rdc_client_main.h" #include "rdc/rdc_client.h" #include "common/rdc_utils.h" #include "rdc/rdc_exception.h" @@ -113,6 +114,45 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip, CATCH } +rdc_status_t +rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, + grpc_connectivity_state *state) { + TRY + CHK_PTR_ARG(state) + UINTPTR_TO_RDC_CHAN(channel) + + *state = ch->channel()->GetState(try_to_connect); + return RDC_STATUS_SUCCESS; + + CATCH +} + +rdc_status_t +rdc_channel_connection_verify(rdc_channel_t channel) { + TRY + UINTPTR_TO_RDC_CHAN(channel) + + ::rdc::VerifyConnectionResponse resp; + ::rdc::VerifyConnectionRequest req; + ::grpc::ClientContext context; + unsigned int seed = time(NULL); + + req.set_magic_num(static_cast(rand_r(&seed))); + ::grpc::Status status = + ch->rdc_admin_stub()->VerifyConnection(&context, req, &resp); + + if (!status.ok()) { + return amd::rdc::GrpcErrorToRdcError(status.error_code()); + } + + if (resp.echo_magic_num() != req.magic_num()) { + return RDC_STATUS_GRPC_DATA_LOSS; + } + + return RDC_STATUS_SUCCESS; + + CATCH +} rdc_status_t rdc_channel_destroy(rdc_channel_t channel) { @@ -135,7 +175,8 @@ rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu) { ::rdc::GetNumDevicesResponse resp; ::rdc::GetNumDevicesRequest empty; ::grpc::ClientContext context; - ::grpc::Status status = ch->stub()->GetNumDevices(&context, empty, &resp); + ::grpc::Status status = + ch->rsmi_stub()->GetNumDevices(&context, empty, &resp); if (!status.ok()) { return amd::rdc::GrpcErrorToRdcError(status.error_code()); @@ -171,7 +212,8 @@ rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, in_args.set_dv_ind(dv_ind); in_args.set_sensor_type(sensor_type); - ::grpc::Status status = ch->stub()->GetTemperature(&context, in_args, &resp); + ::grpc::Status status = + ch->rsmi_stub()->GetTemperature(&context, in_args, &resp); if (!status.ok()) { return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); diff --git a/projects/rdc/client/src/rdc_main.cc b/projects/rdc/client/src/rdc_client_main.cc similarity index 84% rename from projects/rdc/client/src/rdc_main.cc rename to projects/rdc/client/src/rdc_client_main.cc index 0a4164a616..a40277f478 100755 --- a/projects/rdc/client/src/rdc_main.cc +++ b/projects/rdc/client/src/rdc_client_main.cc @@ -27,7 +27,7 @@ THE SOFTWARE. #include #include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_main.h" +#include "rdc/rdc_client_main.h" #include "rdc/rdc_client.h" namespace amd { @@ -48,21 +48,25 @@ RDCChannel::Initialize(void) { std::string addr_str = server_ip() + ":"; addr_str += server_port(); - std::shared_ptr channel; - if (secure_channel_) { // Not yet supported return RDC_STATUS_GRPC_UNIMPLEMENTED; } else { - channel = ::grpc::CreateChannel(addr_str, + channel_ = ::grpc::CreateChannel(addr_str, grpc::InsecureChannelCredentials()); } - stub_ = ::rdc::Rsmi::NewStub(channel); - - if (stub_ == nullptr) { + rsmi_stub_ = ::rdc::Rsmi::NewStub(channel_); + if (rsmi_stub_ == nullptr) { return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED; } + + rdc_admin_stub_ = ::rdc::RdcAdmin::NewStub(channel_); + if (rdc_admin_stub_ == nullptr) { + return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED; + } + + // Test to see if we can connect to server; if not, return err. return RDC_STATUS_SUCCESS; } diff --git a/projects/rdc/common/rdc_utils.cc b/projects/rdc/common/rdc_utils.cc index aeb3dc0679..051e61dd34 100755 --- a/projects/rdc/common/rdc_utils.cc +++ b/projects/rdc/common/rdc_utils.cc @@ -34,7 +34,7 @@ rdc_status_t GrpcErrorToRdcError(grpc::StatusCode grpc_err) { uint32_t rdc_grpc_base_int = static_cast(RDC_STATUS_GRPC_ERR_FIRST); uint32_t rdc_err_int = grpc_err_int + rdc_grpc_base_int; - + return static_cast(rdc_err_int); } diff --git a/projects/rdc/docs/RDC_Manual.pdf b/projects/rdc/docs/RDC_Manual.pdf index 656627a628..c86885a51d 100644 Binary files a/projects/rdc/docs/RDC_Manual.pdf and b/projects/rdc/docs/RDC_Manual.pdf differ diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index a857ecfd93..ca9a2ea70d 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -27,7 +27,18 @@ syntax = "proto3"; // option objc_class_prefix = "HLW"; package rdc; - + +/****************************************************************************/ +/********************************** Rsmi Service ****************************/ +/****************************************************************************/ +service Rsmi { + // RSMI ID services + rpc GetNumDevices (GetNumDevicesRequest) returns(GetNumDevicesResponse) {} + + // RSMI Physical Queries + rpc GetTemperature(GetTemperatureRequest) returns(GetTemperatureResponse){} +} + // rsmi_num_monitor_devices() message GetNumDevicesRequest { } @@ -36,14 +47,6 @@ message GetNumDevicesResponse { uint64 ret_val = 2; } -/* GetNumDevices */ -message VerifyConnectionRequest { - string name = 1; -} -message VerifyConnectionResponse { - string message = 1; -} - /* GetTemperature */ message GetTemperatureRequest { uint32 dv_ind = 1; @@ -71,15 +74,20 @@ message GetTemperatureResponse { uint64 ret_val = 2; } -// The greeting service definition. -service Rsmi { +/****************************************************************************/ +/********************************** RdcAdmin Service ************************/ +/****************************************************************************/ +service RdcAdmin { // RDC admin services - rpc VerifyConnection (VerifyConnectionRequest) returns (VerifyConnectionResponse) {} - - // RSMI ID services - rpc GetNumDevices (GetNumDevicesRequest) returns(GetNumDevicesResponse) {} - - // RSMI Physical Queries - rpc GetTemperature(GetTemperatureRequest) returns(GetTemperatureResponse) {} + rpc VerifyConnection (VerifyConnectionRequest) + returns (VerifyConnectionResponse) {} +} + +/* GetNumDevices */ +message VerifyConnectionRequest { + uint64 magic_num = 1; +} +message VerifyConnectionResponse { + uint64 echo_magic_num = 1; } diff --git a/projects/rdc/server/CMakeLists.txt b/projects/rdc/server/CMakeLists.txt index 10c94fecce..221bdfe840 100755 --- a/projects/rdc/server/CMakeLists.txt +++ b/projects/rdc/server/CMakeLists.txt @@ -71,7 +71,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include "${PROTOB_OUT_DIR}" "${RSMI_INC_DIR}") set(SERVER_SRC_LIST "${SRC_DIR}/rdc_rsmi_service.cc") -set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_main.cc") +set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_admin_service.cc") +set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${SRC_DIR}/rdc_server_main.cc") set(SERVER_SRC_LIST ${SERVER_SRC_LIST} "${PROTOBUF_GENERATED_SRCS}") message("SERVER_SRC_LIST=${SERVER_SRC_LIST}") diff --git a/projects/rdc/server/include/rdc/rdc_admin_service.h b/projects/rdc/server/include/rdc/rdc_admin_service.h new file mode 100755 index 0000000000..c0f8d0c1ce --- /dev/null +++ b/projects/rdc/server/include/rdc/rdc_admin_service.h @@ -0,0 +1,39 @@ +/* +Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_ +#define SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_ + +#include "rdc.grpc.pb.h" // NOLINT +#include "rocm_smi/rocm_smi.h" +#include "rdc/rdc_admin_service.h" + +class RDCAdminServiceImpl final : public ::rdc::RdcAdmin::Service { + public: + RDCAdminServiceImpl(); + ~RDCAdminServiceImpl(); + ::grpc::Status VerifyConnection(::grpc::ServerContext* context, + const rdc::VerifyConnectionRequest* request, + rdc::VerifyConnectionResponse* reply) override; + private: +}; + +#endif // SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_ diff --git a/projects/rdc/server/include/rdc/rdc_rsmi_service.h b/projects/rdc/server/include/rdc/rdc_rsmi_service.h index c5b8a2215d..a420cfe311 100755 --- a/projects/rdc/server/include/rdc/rdc_rsmi_service.h +++ b/projects/rdc/server/include/rdc/rdc_rsmi_service.h @@ -33,10 +33,6 @@ class RsmiServiceImpl final : public ::rdc::Rsmi::Service { rsmi_status_t Initialize(uint64_t rsmi_init_flags = 0); - ::grpc::Status VerifyConnection(::grpc::ServerContext* context, - const rdc::VerifyConnectionRequest* request, - rdc::VerifyConnectionResponse* reply) override; - ::grpc::Status GetNumDevices(::grpc::ServerContext* context, const ::rdc::GetNumDevicesRequest* request, diff --git a/projects/rdc/server/include/rdc/rdc_main.h b/projects/rdc/server/include/rdc/rdc_server_main.h similarity index 80% rename from projects/rdc/server/include/rdc/rdc_main.h rename to projects/rdc/server/include/rdc/rdc_server_main.h index 98c4d347c7..5af7a32e0b 100755 --- a/projects/rdc/server/include/rdc/rdc_main.h +++ b/projects/rdc/server/include/rdc/rdc_server_main.h @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SERVER_INCLUDE_RDC_RDC_MAIN_H_ -#define SERVER_INCLUDE_RDC_RDC_MAIN_H_ +#ifndef SERVER_INCLUDE_RDC_RDC_SERVER_MAIN_H_ +#define SERVER_INCLUDE_RDC_RDC_SERVER_MAIN_H_ #include @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include "rdc/rdc_rsmi_service.h" +#include "rdc/rdc_admin_service.h" class RDCServer { public: @@ -41,15 +42,22 @@ class RDCServer { bool start_rsmi_service(void) const {return start_rsmi_service_;} void set_start_rsmi_service(bool s) {start_rsmi_service_ = s;} + bool start_rdc_admin_service(void) const {return start_rdc_admin_service_;} + void set_start_rdc_admin_service(bool s) {start_rdc_admin_service_ = s;} + void ShutDown(void); private: void HandleSignal(int sig); std::string server_address_; - bool start_rsmi_service_; std::unique_ptr<::grpc::Server> server_; + bool start_rsmi_service_; RsmiServiceImpl *rsmi_service_; + + bool start_rdc_admin_service_; + RDCAdminServiceImpl *rdc_admin_service_; }; -#endif // SERVER_INCLUDE_RDC_RDC_MAIN_H_ +#endif // SERVER_INCLUDE_RDC_RDC_SERVER_MAIN_H_ + diff --git a/projects/rdc/server/rdc.service b/projects/rdc/server/rdc.service index 218a7c4356..713f8d10c8 100755 --- a/projects/rdc/server/rdc.service +++ b/projects/rdc/server/rdc.service @@ -7,7 +7,7 @@ Description=Radeon Data Center Daemon (rdcd) After=network.target # Add any services that must be started before rdcd here -#After= +#After= # Add any non-service units required by rdcd here #Requires= diff --git a/projects/rdc/server/src/rdc_admin_service.cc b/projects/rdc/server/src/rdc_admin_service.cc new file mode 100755 index 0000000000..af4e3aa917 --- /dev/null +++ b/projects/rdc/server/src/rdc_admin_service.cc @@ -0,0 +1,49 @@ + +/* +Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "rdc.grpc.pb.h" // NOLINT +#include "rdc/rdc_admin_service.h" + +RDCAdminServiceImpl::RDCAdminServiceImpl() { +} + +RDCAdminServiceImpl::~RDCAdminServiceImpl() { +} +::grpc::Status +RDCAdminServiceImpl::VerifyConnection(::grpc::ServerContext* context, + const rdc::VerifyConnectionRequest* request, + rdc::VerifyConnectionResponse* reply) { + (void)context; // Quiet warning for now + + reply->set_echo_magic_num(request->magic_num()); + return ::grpc::Status::OK; +} diff --git a/projects/rdc/server/src/rdc_rsmi_service.cc b/projects/rdc/server/src/rdc_rsmi_service.cc index 58c2200daa..1c75a22ee5 100755 --- a/projects/rdc/server/src/rdc_rsmi_service.cc +++ b/projects/rdc/server/src/rdc_rsmi_service.cc @@ -61,15 +61,6 @@ RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) { } return rsmi_ret; } -::grpc::Status -RsmiServiceImpl::VerifyConnection(::grpc::ServerContext* context, - const rdc::VerifyConnectionRequest* request, - rdc::VerifyConnectionResponse* reply) { - (void)context; // Quiet warning for now - std::string prefix("Hello "); - reply->set_message(prefix + request->name()); - return ::grpc::Status::OK; -} ::grpc::Status RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context, diff --git a/projects/rdc/server/src/rdc_main.cc b/projects/rdc/server/src/rdc_server_main.cc similarity index 95% rename from projects/rdc/server/src/rdc_main.cc rename to projects/rdc/server/src/rdc_server_main.cc index 00fe2c56cb..0f4240fe23 100755 --- a/projects/rdc/server/src/rdc_main.cc +++ b/projects/rdc/server/src/rdc_server_main.cc @@ -36,7 +36,7 @@ THE SOFTWARE. #include "rdc.grpc.pb.h" // NOLINT #include "rocm_smi/rocm_smi.h" -#include "rdc/rdc_main.h" +#include "rdc/rdc_server_main.h" #include "rdc/rdc_rsmi_service.h" static bool sShutDownServer = false; @@ -46,7 +46,7 @@ static const char *kRDCDHomeDir = "/"; static const char *kDaemonLockFile = "/var/run/rdcd.lock"; RDCServer::RDCServer() : server_address_("0.0.0.0:50051"), - rsmi_service_(nullptr) { + rsmi_service_(nullptr), rdc_admin_service_(nullptr) { } RDCServer::~RDCServer() { @@ -66,6 +66,10 @@ RDCServer::Run() { // Register services as the instances through which we'll communicate with // clients. These are synchronous services. + if (start_rdc_admin_service()) { + rdc_admin_service_ = new RDCAdminServiceImpl(); + builder.RegisterService(rdc_admin_service_); + } if (start_rsmi_service()) { rsmi_service_ = new RsmiServiceImpl(); @@ -125,6 +129,11 @@ RDCServer::ShutDown(void) { delete rsmi_service_; rsmi_service_ = nullptr; } + + if (rdc_admin_service_) { + delete rdc_admin_service_; + rdc_admin_service_ = nullptr; + } } static void * ProcessSignalLoop(void *server_ptr) { @@ -268,6 +277,8 @@ int main(int argc, char** argv) { // TODO(cfreehil): Eventually, set these by reading a config file rdc_server.set_start_rsmi_service(true); + rdc_server.set_start_rdc_admin_service(true); + // rdc_server.set_secure_communications(false); // rdc_server.set_address("0.0.0.0:50051") diff --git a/projects/rdc/tests/example/CMakeLists.txt b/projects/rdc/tests/example/CMakeLists.txt index 149a0718b7..104be28fa2 100755 --- a/projects/rdc/tests/example/CMakeLists.txt +++ b/projects/rdc/tests/example/CMakeLists.txt @@ -41,9 +41,6 @@ endif () # Required Defines first: -set(RSMI_INC_DIR ${ROCM_DIR}/include) -set(RSMI_LIB_DIR ${ROCM_DIR}/lib) - message("") message("Build Configuration:") message("-----------BuildType: " ${CMAKE_BUILD_TYPE}) @@ -53,7 +50,6 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR}) message("--------RSMI Inc Dir: " ${RSMI_INC_DIR}) message("") @@ -66,8 +62,6 @@ include(utils) ## Verbose output. set(CMAKE_VERBOSE_MAKEFILE on) -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../client/include" - "${PROTOB_OUT_DIR}" "${RSMI_INC_DIR}") set(EXAMPLE_SRC_LIST "${SRC_DIR}/rdc_client_test.cc") message("EXAMPLE_SRC_LIST=${EXAMPLE_SRC_LIST}") @@ -77,6 +71,11 @@ set(CLIENT_LIB_INC_LIST "${INC_DIR}/rdc_client.h") set(TEST_CLIENT_EXE "rdc_test_client") add_executable(${TEST_CLIENT_EXE} "${EXAMPLE_SRC_LIST}") + +target_include_directories(${TEST_CLIENT_EXE} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../../client/include" + "${RSMI_INC_DIR}") + target_link_libraries(${TEST_CLIENT_EXE} rdc_client) message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") diff --git a/projects/rdc/tests/example/rdc_client_test.cc b/projects/rdc/tests/example/rdc_client_test.cc index b4eca57d0f..c81dc0d853 100755 --- a/projects/rdc/tests/example/rdc_client_test.cc +++ b/projects/rdc/tests/example/rdc_client_test.cc @@ -31,7 +31,10 @@ THE SOFTWARE. #define CHK_RET_STATUS(RET) \ if ((RET) != RDC_STATUS_SUCCESS) { \ - std::cout << "rdc call returned error: " << (RET) << std::endl; \ + const char *err_msg_str; \ + (void)rdc_status_string((RET), &err_msg_str); \ + std::cout << "rdc call returned error: " << (RET) << ":\"" << \ + err_msg_str << "\"" << std::endl; \ } #define CHK_RET_STATUS_CONT(RET) \ @@ -41,11 +44,8 @@ THE SOFTWARE. } int main(int argc, char** argv) { - (void)argc; // ignore for now - (void)argv; // ignore for now - rdc_status_t ret; - rdc_channel_t server; + rdc_channel_t server_ch; uint64_t num_gpu; int64_t temperature; std::string serv_host("localhost"); @@ -61,27 +61,39 @@ int main(int argc, char** argv) { std::cout << "Attempting to create channel to " << serv_host << ":" << serv_port << std::endl; - ret = rdc_channel_create(&server, serv_host.c_str(), serv_port.c_str(), + ret = rdc_channel_create(&server_ch, serv_host.c_str(), serv_port.c_str(), false); CHK_RET_STATUS(ret) std::cout << "Successfully created channel" << std::endl; - std::cout << "Getting number of gpus at server..." << std::endl; - ret = rdc_num_gpus_get(server, &num_gpu); + grpc_connectivity_state ch_state; + ret = rdc_channel_state_get(server_ch, true, &ch_state); CHK_RET_STATUS(ret) - std::cout << "Number of GPUs at server is " << num_gpu << std::endl; + std::cout << "Current channel state is " << ch_state << std::endl; + + std::cout << "Verifying connection to server..." << std::endl; + ret = rdc_channel_connection_verify(server_ch); + CHK_RET_STATUS(ret) + if (ret == RDC_STATUS_SUCCESS) { + std::cout << "Verified connection to server." << std::endl; + } + std::cout << "Getting number of gpus at server..." << std::endl; + ret = rdc_num_gpus_get(server_ch, &num_gpu); + CHK_RET_STATUS(ret) + std::cout << "Number of GPUs at server is " << server_ch << + num_gpu << std::endl; for (uint32_t dv_ind = 0; dv_ind < num_gpu; ++dv_ind) { std::cout << "Info for Device " << dv_ind << ":" << std::endl; std::cout << "\tGetting temperature..." << std::endl; - ret = rdc_dev_temp_metric_get(server, dv_ind, RSMI_TEMP_TYPE_JUNCTION, + ret = rdc_dev_temp_metric_get(server_ch, dv_ind, RSMI_TEMP_TYPE_JUNCTION, RSMI_TEMP_CURRENT, &temperature); CHK_RET_STATUS_CONT(ret) std::cout << "\t GPU " << dv_ind << " has a temperature of " << temperature << std::endl; } - ret = rdc_channel_destroy(server); + ret = rdc_channel_destroy(server_ch); CHK_RET_STATUS(ret) std::cout << "Successfully destroyed channel to " << serv_host << ":" << serv_port << std::endl;