From 52bb0d6466aa90238a7618a7ed971b8ce619bb80 Mon Sep 17 00:00:00 2001 From: stali Date: Tue, 17 Dec 2024 14:29:05 +0800 Subject: [PATCH] Enable RDC link Status feature 1.add link status APIs 2.Add link status example for link status API usage [ROCm/rdc commit: 29b6699b621395aa5fa74faa8fa8d16ed329a85c] --- projects/rdc/example/CMakeLists.txt | 6 + projects/rdc/example/topologylink_example.cc | 10 +- projects/rdc/include/rdc/rdc.h | 2 +- .../include/rdc_lib/impl/RdcEmbeddedHandler.h | 3 +- projects/rdc/protos/rdc.proto | 28 +++++ .../rdc_libs/rdc/src/RdcTopologyLinkImpl.cc | 42 +++++++ .../rdc_client/src/RdcStandaloneHandler.cc | 45 +++++--- projects/rdc/rdci/CMakeLists.txt | 1 + .../rdci/include/RdciTopologyLinkSubSystem.h | 2 +- .../include/RdciXgmiLinkStatusSubSystem.h | 43 ++++++++ .../rdc/rdci/src/RdciTopologyLinkSubSystem.cc | 8 +- .../rdci/src/RdciXgmiLinkStatusSubSystem.cc | 104 ++++++++++++++++++ projects/rdc/rdci/src/rdci.cc | 5 +- .../rdc/server/include/rdc/rdc_api_service.h | 5 +- projects/rdc/server/src/rdc_api_service.cc | 33 ++++++ 15 files changed, 315 insertions(+), 22 deletions(-) create mode 100644 projects/rdc/rdci/include/RdciXgmiLinkStatusSubSystem.h create mode 100644 projects/rdc/rdci/src/RdciXgmiLinkStatusSubSystem.cc diff --git a/projects/rdc/example/CMakeLists.txt b/projects/rdc/example/CMakeLists.txt index c3cae14c10..68ebb9bb83 100755 --- a/projects/rdc/example/CMakeLists.txt +++ b/projects/rdc/example/CMakeLists.txt @@ -132,6 +132,12 @@ set(CONFIG_EXAMPLE_EXE "config") add_executable(${CONFIG_EXAMPLE_EXE} "${CONFIG_EXAMPLE_SRC_LIST}") target_link_libraries(${CONFIG_EXAMPLE_EXE} pthread dl rdc_bootstrap) +set(TOPOLOGYLINK_EXAMPLE_SRC_LIST "topologylink_example.cc") +cmake_print_variables(TOPOLOGYLINK_EXAMPLE_SRC_LIST) +set(TOPOLOGYLINK_EXAMPLE_EXE "topologylink") +add_executable(${TOPOLOGYLINK_EXAMPLE_EXE} "${TOPOLOGYLINK_EXAMPLE_SRC_LIST}") +target_link_libraries(${TOPOLOGYLINK_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") diff --git a/projects/rdc/example/topologylink_example.cc b/projects/rdc/example/topologylink_example.cc index b313bd34f6..cbe26e49ac 100644 --- a/projects/rdc/example/topologylink_example.cc +++ b/projects/rdc/example/topologylink_example.cc @@ -119,10 +119,18 @@ int main() { << "max_bandwidth: " << std::to_string(topo.link_infos[i].max_bandwidth) << "\n" << "hops: " << std::to_string(topo.link_infos[i].hops) << "\n" << "link_type: " << topology_link_type_to_str(topo.link_infos[i].link_type) << "\n" - << "is_p2p_accessible: " << std::to_string(topo.link_infos[i].is_p2p_accessible) << "\n" + << "is_p2p_accessible: " << std::to_string(topo.link_infos[i].is_p2p_accessible) + << "\n" << std::endl; } + rdc_link_status_t link_status; + result = rdc_link_status_get(rdc_handle, &link_status); + if (result != RDC_ST_OK) { + std::cout << "Error clear topology, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + //... clean up cleanup: std::cout << "Cleaning up.\n"; diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index 088d2a3f4b..f0604105f7 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -654,7 +654,7 @@ typedef enum { #define RDC_MAX_NUM_OF_LINKS 16 typedef struct { - int32_t gpu_index; + uint32_t gpu_index; uint32_t num_of_links; // The size of the array link_states rdc_topology_link_type_t link_types; // XGMI, PCIe, and so on rdc_link_state_t link_states[RDC_MAX_NUM_OF_LINKS]; diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index 06c2342d9c..d82d529d2f 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -147,9 +147,10 @@ class RdcEmbeddedHandler final : public RdcHandler { RdcWatchTablePtr watch_table_; RdcMetricsUpdaterPtr metrics_updater_; RdcPolicyPtr policy_; + RdcTopologyLinkPtr topologylink_; RdcConfigSettingsPtr config_handler_; std::future updater_; - RdcTopologyLinkPtr topologylink_; + }; } // namespace rdc diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index 5907143530..eb6f2732ed 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -217,6 +217,8 @@ service RdcAPI { //Clear the setting rpc ClearConfig(ClearConfigRequest) returns (ClearConfigResponse) {} + // rdc_status_t GetLinkStatus() + rpc GetLinkStatus(Empty) returns (GetLinkStatusResponse) {} } message Empty { @@ -729,6 +731,32 @@ message GetTopologyResponse { uint32 status = 1; Topology toppology = 2; } +message GpuLinkStatus{ + uint32 gpu_index = 1; + uint32 num_of_links = 2; + enum LinkTypes { + RDC_IOLINK_TYPE_UNDEFINED = 0; + RDC_IOLINK_TYPE_PCIEXPRESS = 1; + RDC_IOLINK_TYPE_XGMI = 2; + RDCI_IOLINK_TYPE_NUMIOLINKTYPES = 3; + }; + LinkTypes link_types = 3; + enum LinkState{ + RDC_LINK_STATE_NOT_SUPPORTED = 0; + RDC_LINK_STATE_DISABLED = 1; + RDC_LINK_STATE_DOWN = 2; + RDC_LINK_STATE_UP = 3; + }; + repeated LinkState link_states = 4; +} +message LinkStatus{ + uint32 num_of_gpus = 1; + repeated GpuLinkStatus gpus = 2; +} +message GetLinkStatusResponse{ + uint32 status = 1; + LinkStatus linkstatus = 2; +} diff --git a/projects/rdc/rdc_libs/rdc/src/RdcTopologyLinkImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcTopologyLinkImpl.cc index 9c90f48de9..947ac1dacc 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcTopologyLinkImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcTopologyLinkImpl.cc @@ -25,6 +25,7 @@ THE SOFTWARE. #include #include +#include #include #include #include @@ -75,6 +76,7 @@ rdc_status_t RdcTopologyLinkImpl::rdc_device_topology_get(uint32_t gpu_index, // Assign the index to the index list count = device_count.value.l_int; + assert(count <= RDC_MAX_NUM_DEVICES); for (uint32_t i = 0; i < count; i++) { gpu_index_list[i] = i; } @@ -134,6 +136,46 @@ rdc_status_t RdcTopologyLinkImpl::rdc_device_topology_get(uint32_t gpu_index, rdc_status_t RdcTopologyLinkImpl::rdc_link_status_get(rdc_link_status_t* results) { rdc_status_t status = RDC_ST_NOT_FOUND; + amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + + uint32_t count = 0; + rdc_field_value device_count; + status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count); + if (status != RDC_ST_OK) { + return status; + } + + // Assign the index to the index list + count = device_count.value.l_int; + assert(count <= RDC_MAX_NUM_DEVICES); + for (uint32_t i = 0; i < count; i++) { + gpu_index_list[i] = i; + } + results->num_of_gpus = count; + + for (uint32_t i = 0; i < count; i++) { + amdsmi_processor_handle processor_handle; + err = get_processor_handle_from_id(gpu_index_list[i], &processor_handle); + if (err != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "Fail to get process GPUs processor handle information: " << err); + return status; + } + + amdsmi_xgmi_link_status_t link_status; + err = amdsmi_get_gpu_xgmi_link_status(processor_handle, &link_status); + if (err != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_INFO, "Fail to get process GPUs xgmi link information: " << err); + } + results->gpus[i].gpu_index = gpu_index_list[i]; + results->gpus[i].num_of_links = link_status.total_links; + for (uint32_t n = 0; n < link_status.total_links; n++) { + results->gpus[i].link_states[n] = static_cast(link_status.status[n]); + } + + results->gpus[i].link_types = RDC_IOLINK_TYPE_XGMI; + } + return status; } diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index e587f717ad..1dde77dd7f 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -1027,27 +1027,46 @@ rdc_status_t RdcStandaloneHandler::rdc_device_topology_get(uint32_t gpu_index, if (err_status != RDC_ST_OK) return err_status; ::rdc::Topology Topology = reply.toppology(); - results->num_of_gpus= Topology.num_of_gpus(); - results->numa_node= Topology.numa_node(); + results->num_of_gpus = Topology.num_of_gpus(); + results->numa_node = Topology.numa_node(); for (uint32_t i = 0; i < Topology.num_of_gpus(); ++i) { - ::rdc::TopologyLinkInfo linkinfo = Topology.link_infos(i); - results->link_infos[i].gpu_index=linkinfo.gpu_index(); - results->link_infos[i].weight=linkinfo.weight(); - results->link_infos[i].min_bandwidth=linkinfo.min_bandwidth(); - results->link_infos[i].max_bandwidth=linkinfo.max_bandwidth(); - results->link_infos[i].hops=linkinfo.hops(); - results->link_infos[i].link_type=static_cast(linkinfo.link_type()); - results->link_infos[i].is_p2p_accessible=linkinfo.p2p_accessible(); + ::rdc::TopologyLinkInfo linkinfo = Topology.link_infos(i); + results->link_infos[i].gpu_index = linkinfo.gpu_index(); + results->link_infos[i].weight = linkinfo.weight(); + results->link_infos[i].min_bandwidth = linkinfo.min_bandwidth(); + results->link_infos[i].max_bandwidth = linkinfo.max_bandwidth(); + results->link_infos[i].hops = linkinfo.hops(); + results->link_infos[i].link_type = static_cast(linkinfo.link_type()); + results->link_infos[i].is_p2p_accessible = linkinfo.p2p_accessible(); } return RDC_ST_OK; } rdc_status_t RdcStandaloneHandler::rdc_link_status_get(rdc_link_status_t* results) { - ::rdc::UpdateAllFieldsResponse reply; - ::grpc::Status status = grpc::Status::OK; - return error_handle(status, reply.status()); + ::rdc::Empty request; + ::rdc::GetLinkStatusResponse reply; + ::grpc::ClientContext context; + + ::grpc::Status status = stub_->GetLinkStatus(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + + ::rdc::LinkStatus LinkStatus = reply.linkstatus(); + results->num_of_gpus = LinkStatus.num_of_gpus(); + + for (uint32_t i = 0; i < LinkStatus.num_of_gpus(); ++i) { + ::rdc::GpuLinkStatus gpulinkstatus = LinkStatus.gpus(i); + results->gpus[i].gpu_index = gpulinkstatus.gpu_index(); + results->gpus[i].num_of_links = gpulinkstatus.num_of_links(); + results->gpus[i].link_types = static_cast(gpulinkstatus.link_types()); + for (uint32_t n = 0; n < gpulinkstatus.num_of_links(); n++) { + results->gpus[i].link_states[n] = static_cast(gpulinkstatus.link_states(n)); + } + } + + return RDC_ST_OK; } } // namespace rdc diff --git a/projects/rdc/rdci/CMakeLists.txt b/projects/rdc/rdci/CMakeLists.txt index 0888c31eb0..07a7a612cf 100644 --- a/projects/rdc/rdci/CMakeLists.txt +++ b/projects/rdc/rdci/CMakeLists.txt @@ -72,6 +72,7 @@ set(RDCI_SRC_LIST "${SRC_DIR}/RdciConfigSubSystem.cc" "${SRC_DIR}/RdciSubSystem.cc" "${SRC_DIR}/RdciTopologyLinkSubSystem.cc" + "${SRC_DIR}/RdciXgmiLinkStatusSubSystem.cc" "${SRC_DIR}/rdci.cc") message("RDCI_SRC_LIST=${RDCI_SRC_LIST}") set(RDCI_EXE "rdci") diff --git a/projects/rdc/rdci/include/RdciTopologyLinkSubSystem.h b/projects/rdc/rdci/include/RdciTopologyLinkSubSystem.h index b36330ae7d..0af6c088f7 100644 --- a/projects/rdc/rdci/include/RdciTopologyLinkSubSystem.h +++ b/projects/rdc/rdci/include/RdciTopologyLinkSubSystem.h @@ -34,7 +34,7 @@ class RdciTopologyLinkSubSystem : public RdciSubSystem { private: void show_help() const; enum OPERATIONS { - POLICY_UNKNOWN = 0, + TOPOLOGY_UNKNOWN = 0, TOPOLOGY_INDEX, } topology_ops_; uint32_t group_index_; diff --git a/projects/rdc/rdci/include/RdciXgmiLinkStatusSubSystem.h b/projects/rdc/rdci/include/RdciXgmiLinkStatusSubSystem.h new file mode 100644 index 0000000000..0b79d5de1b --- /dev/null +++ b/projects/rdc/rdci/include/RdciXgmiLinkStatusSubSystem.h @@ -0,0 +1,43 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef RDCI_INCLUDE_RDCIXMGILINKSTATUSSYSTEM_H_ +#define RDCI_INCLUDE_RDCIXMGILINKSTATUSSYSTEM_H_ +#include + +#include + +#include "RdciSubSystem.h" +namespace amd { +namespace rdc { +class RdciXgmiLinkStatusSubSystem : public RdciSubSystem { + public: + RdciXgmiLinkStatusSubSystem(); + void parse_cmd_opts(int argc, char** argv) override; + void process() override; + + private: + void show_help() const; + enum OPERATIONS { + XMGI_LINK_UNKNOWN = 0, + XMGI_LINK_STATUS, + } link_status_ops_; +}; +} // namespace rdc +} // namespace amd +#endif // RDCI_INCLUDE_RDCIXMGILINKSTATUSSYSTEM_H_ \ No newline at end of file diff --git a/projects/rdc/rdci/src/RdciTopologyLinkSubSystem.cc b/projects/rdc/rdci/src/RdciTopologyLinkSubSystem.cc index 5a5c7057aa..fa00d2081a 100644 --- a/projects/rdc/rdci/src/RdciTopologyLinkSubSystem.cc +++ b/projects/rdc/rdci/src/RdciTopologyLinkSubSystem.cc @@ -28,7 +28,7 @@ THE SOFTWARE. #include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdciTopologyLinkSubSystem::RdciTopologyLinkSubSystem() {} +RdciTopologyLinkSubSystem::RdciTopologyLinkSubSystem() : topology_ops_(TOPOLOGY_UNKNOWN) {} void RdciTopologyLinkSubSystem::parse_cmd_opts(int argc, char** argv) { const int HOST_OPTIONS = 1000; @@ -109,8 +109,7 @@ void RdciTopologyLinkSubSystem::process() { << "------------------+\n"; for (uint32_t i = 0; i < topology.num_of_gpus; i++) { std::cout << "| To GPU " << i + 1 << "\t\t" - << "| " << topology_link_type_to_str(RDC_IOLINK_TYPE_XGMI) - << "\t\t\t|\n"; + << "| " << topology_link_type_to_str(RDC_IOLINK_TYPE_XGMI) << "\t\t\t|\n"; } std::cout << "+-----------------------+" << "-----------------------------" @@ -118,6 +117,9 @@ void RdciTopologyLinkSubSystem::process() { } break; } + default: + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); + break; } } } // namespace rdc diff --git a/projects/rdc/rdci/src/RdciXgmiLinkStatusSubSystem.cc b/projects/rdc/rdci/src/RdciXgmiLinkStatusSubSystem.cc new file mode 100644 index 0000000000..f6f83eea15 --- /dev/null +++ b/projects/rdc/rdci/src/RdciXgmiLinkStatusSubSystem.cc @@ -0,0 +1,104 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "RdciXgmiLinkStatusSubSystem.h" + +#include +#include +#include + +#include "common/rdc_utils.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" +namespace amd { +namespace rdc { +RdciXgmiLinkStatusSubSystem::RdciXgmiLinkStatusSubSystem() : link_status_ops_(XMGI_LINK_UNKNOWN) {} + +void RdciXgmiLinkStatusSubSystem::parse_cmd_opts(int argc, char** argv) { + const int HOST_OPTIONS = 1000; + const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"xgmi-link-status", optional_argument, nullptr, 'l'}, + {nullptr, 0, nullptr, 0}}; + int option_index = 0; + int opt = 0; + while ((opt = getopt_long(argc, argv, "hul", long_options, &option_index)) != -1) { + { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case 'h': + show_help(); + break; + case 'u': + use_auth_ = false; + break; + case 'l': + link_status_ops_ = XMGI_LINK_STATUS; + break; + default: + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); + } + } + } +} +void RdciXgmiLinkStatusSubSystem::show_help() const { + std::cout << " link -- Used to link Get the link status the link is up or down.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci link [--host :port] [--xgmi-link-status] -l \n"; + std::cout << "\nFlags:\n"; + show_common_usage(); +} + +static const char* xgmi_link_status_to_str(rdc_link_state_t state) { + if (state == RDC_LINK_STATE_DISABLED) return "X "; + if (state == RDC_LINK_STATE_DOWN) return "D "; + if (state == RDC_LINK_STATE_UP) return "U "; + return "N/A "; +} + +void RdciXgmiLinkStatusSubSystem::process() { + rdc_status_t result = RDC_ST_OK; + switch (link_status_ops_) { + case XMGI_LINK_STATUS: { + rdc_link_status_t link_status; + result = rdc_link_status_get(rdc_handle_, &link_status); + if (result == RDC_ST_OK) { + std::cout << "GPUs:\n"; + for (int32_t i = 0; i < link_status.num_of_gpus; i++) { + std::cout << " GPU:" << link_status.gpus[i].gpu_index << "\n"; + std::cout << " "; + for (uint32_t n = 0; n < link_status.gpus[i].num_of_links; n++) { + std::cout << xgmi_link_status_to_str(link_status.gpus[i].link_states[n]); + } + std::cout << "\n"; + } + } + break; + } + default: + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); + break; + } +} +} // namespace rdc +} // namespace amd \ No newline at end of file diff --git a/projects/rdc/rdci/src/rdci.cc b/projects/rdc/rdci/src/rdci.cc index f4c5b0c59a..e8294b4b63 100644 --- a/projects/rdc/rdci/src/rdci.cc +++ b/projects/rdc/rdci/src/rdci.cc @@ -35,6 +35,7 @@ THE SOFTWARE. #include "RdciTopologyLinkSubSystem.h" #include "RdciPolicySubSystem.h" #include "RdciStatsSubSystem.h" +#include "RdciXgmiLinkStatusSubSystem.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" #include "rdc_lib/rdc_common.h" @@ -57,7 +58,7 @@ int main(int argc, char** argv) { const std::string usage_help = "Usage:\trdci |\n" "subsystem: \n" - " discovery, dmon, group, fieldgroup, stats, diag, config, policy, health, topo\n" + " discovery, dmon, group, fieldgroup, stats, diag, config, policy, health, topo, link\n" "options: \n" " -v(--version) : Print client version information only\n"; @@ -93,6 +94,8 @@ int main(int argc, char** argv) { subsystem.reset(new amd::rdc::RdciHealthSubSystem()); } else if (subsystem_name == "topo") { subsystem.reset(new amd::rdc::RdciTopologyLinkSubSystem()); + } else if (subsystem_name == "link") { + subsystem.reset(new amd::rdc::RdciXgmiLinkStatusSubSystem()); } else if (subsystem_name == "stats") { subsystem.reset(new amd::rdc::RdciStatsSubSystem()); } else if (subsystem_name == "policy") { diff --git a/projects/rdc/server/include/rdc/rdc_api_service.h b/projects/rdc/server/include/rdc/rdc_api_service.h index afafcfe2b4..974c9c3917 100644 --- a/projects/rdc/server/include/rdc/rdc_api_service.h +++ b/projects/rdc/server/include/rdc/rdc_api_service.h @@ -152,11 +152,14 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { ::grpc::Status UnRegisterPolicy(::grpc::ServerContext* context, const ::rdc::UnRegisterPolicyRequest* request, ::rdc::UnRegisterPolicyResponse* reply) override; - + ::grpc::Status GetTopology(::grpc::ServerContext* context, const ::rdc::GetTopologyRequest* request, ::rdc::GetTopologyResponse* reply) override; + ::grpc::Status GetLinkStatus(::grpc::ServerContext* context, const ::rdc::Empty* request, + ::rdc::GetLinkStatusResponse* reply) override; + ::grpc::Status SetHealth(::grpc::ServerContext* context, const ::rdc::SetHealthRequest* request, ::rdc::SetHealthResponse* reply) override; diff --git a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc index 66bad6d322..edd6659748 100644 --- a/projects/rdc/server/src/rdc_api_service.cc +++ b/projects/rdc/server/src/rdc_api_service.cc @@ -1106,5 +1106,38 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData) return ::grpc::Status::OK; } +::grpc::Status RdcAPIServiceImpl::GetLinkStatus(::grpc::ServerContext* context, + const ::rdc::Empty* request, + ::rdc::GetLinkStatusResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + + rdc_link_status_t link_status_results; + // call RDC link status API + rdc_status_t result = rdc_link_status_get(rdc_handle_, &link_status_results); + reply->set_status(result); + if (result != RDC_ST_OK) { + return ::grpc::Status::OK; + } + + ::rdc::LinkStatus* linkstatus = reply->mutable_linkstatus(); + linkstatus->set_num_of_gpus(link_status_results.num_of_gpus); + for (int32_t i = 0; i < link_status_results.num_of_gpus; ++i) { + ::rdc::GpuLinkStatus* gpulinkstatus = linkstatus->add_gpus(); + gpulinkstatus->set_gpu_index(link_status_results.gpus[i].gpu_index); + gpulinkstatus->set_num_of_links(link_status_results.gpus[i].num_of_links); + gpulinkstatus->set_link_types( + static_cast<::rdc::GpuLinkStatus_LinkTypes>(link_status_results.gpus[i].link_types)); + for (uint32_t n = 0; n < link_status_results.gpus[i].num_of_links; n++) { + gpulinkstatus->add_link_states(static_cast<::rdc::GpuLinkStatus_LinkState>( + link_status_results.gpus[i].link_states[n])); + } + } + + return ::grpc::Status::OK; +} + } // namespace rdc } // namespace amd