Enable RDC link Status feature

1.add link status APIs
   2.Add link status example for link status API usage


[ROCm/rdc commit: 29b6699b62]
This commit is contained in:
stali
2024-12-17 14:29:05 +08:00
کامیت شده توسط Li, Star
والد 725599b51c
کامیت 52bb0d6466
15فایلهای تغییر یافته به همراه315 افزوده شده و 22 حذف شده
@@ -132,6 +132,12 @@ set(CONFIG_EXAMPLE_EXE "config")
add_executable(${CONFIG_EXAMPLE_EXE} "${CONFIG_EXAMPLE_SRC_LIST}")
target_link_libraries(${CONFIG_EXAMPLE_EXE} pthread dl rdc_bootstrap)
set(TOPOLOGYLINK_EXAMPLE_SRC_LIST "topologylink_example.cc")
cmake_print_variables(TOPOLOGYLINK_EXAMPLE_SRC_LIST)
set(TOPOLOGYLINK_EXAMPLE_EXE "topologylink")
add_executable(${TOPOLOGYLINK_EXAMPLE_EXE} "${TOPOLOGYLINK_EXAMPLE_SRC_LIST}")
target_link_libraries(${TOPOLOGYLINK_EXAMPLE_EXE} pthread dl rdc_bootstrap)
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Finished Cmake Example ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
@@ -119,10 +119,18 @@ int main() {
<< "max_bandwidth: " << std::to_string(topo.link_infos[i].max_bandwidth) << "\n"
<< "hops: " << std::to_string(topo.link_infos[i].hops) << "\n"
<< "link_type: " << topology_link_type_to_str(topo.link_infos[i].link_type) << "\n"
<< "is_p2p_accessible: " << std::to_string(topo.link_infos[i].is_p2p_accessible) << "\n"
<< "is_p2p_accessible: " << std::to_string(topo.link_infos[i].is_p2p_accessible)
<< "\n"
<< std::endl;
}
rdc_link_status_t link_status;
result = rdc_link_status_get(rdc_handle, &link_status);
if (result != RDC_ST_OK) {
std::cout << "Error clear topology, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}
//... clean up
cleanup:
std::cout << "Cleaning up.\n";
@@ -654,7 +654,7 @@ typedef enum {
#define RDC_MAX_NUM_OF_LINKS 16
typedef struct {
int32_t gpu_index;
uint32_t gpu_index;
uint32_t num_of_links; // The size of the array link_states
rdc_topology_link_type_t link_types; // XGMI, PCIe, and so on
rdc_link_state_t link_states[RDC_MAX_NUM_OF_LINKS];
@@ -147,9 +147,10 @@ class RdcEmbeddedHandler final : public RdcHandler {
RdcWatchTablePtr watch_table_;
RdcMetricsUpdaterPtr metrics_updater_;
RdcPolicyPtr policy_;
RdcTopologyLinkPtr topologylink_;
RdcConfigSettingsPtr config_handler_;
std::future<void> updater_;
RdcTopologyLinkPtr topologylink_;
};
} // namespace rdc
@@ -217,6 +217,8 @@ service RdcAPI {
//Clear the setting
rpc ClearConfig(ClearConfigRequest) returns (ClearConfigResponse) {}
// rdc_status_t GetLinkStatus()
rpc GetLinkStatus(Empty) returns (GetLinkStatusResponse) {}
}
message Empty {
@@ -729,6 +731,32 @@ message GetTopologyResponse {
uint32 status = 1;
Topology toppology = 2;
}
message GpuLinkStatus{
uint32 gpu_index = 1;
uint32 num_of_links = 2;
enum LinkTypes {
RDC_IOLINK_TYPE_UNDEFINED = 0;
RDC_IOLINK_TYPE_PCIEXPRESS = 1;
RDC_IOLINK_TYPE_XGMI = 2;
RDCI_IOLINK_TYPE_NUMIOLINKTYPES = 3;
};
LinkTypes link_types = 3;
enum LinkState{
RDC_LINK_STATE_NOT_SUPPORTED = 0;
RDC_LINK_STATE_DISABLED = 1;
RDC_LINK_STATE_DOWN = 2;
RDC_LINK_STATE_UP = 3;
};
repeated LinkState link_states = 4;
}
message LinkStatus{
uint32 num_of_gpus = 1;
repeated GpuLinkStatus gpus = 2;
}
message GetLinkStatusResponse{
uint32 status = 1;
LinkStatus linkstatus = 2;
}
@@ -25,6 +25,7 @@ THE SOFTWARE.
#include <sys/time.h>
#include <unistd.h>
#include <assert.h>
#include <algorithm>
#include <ctime>
#include <map>
@@ -75,6 +76,7 @@ rdc_status_t RdcTopologyLinkImpl::rdc_device_topology_get(uint32_t gpu_index,
// Assign the index to the index list
count = device_count.value.l_int;
assert(count <= RDC_MAX_NUM_DEVICES);
for (uint32_t i = 0; i < count; i++) {
gpu_index_list[i] = i;
}
@@ -134,6 +136,46 @@ rdc_status_t RdcTopologyLinkImpl::rdc_device_topology_get(uint32_t gpu_index,
rdc_status_t RdcTopologyLinkImpl::rdc_link_status_get(rdc_link_status_t* results) {
rdc_status_t status = RDC_ST_NOT_FOUND;
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
uint32_t count = 0;
rdc_field_value device_count;
status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count);
if (status != RDC_ST_OK) {
return status;
}
// Assign the index to the index list
count = device_count.value.l_int;
assert(count <= RDC_MAX_NUM_DEVICES);
for (uint32_t i = 0; i < count; i++) {
gpu_index_list[i] = i;
}
results->num_of_gpus = count;
for (uint32_t i = 0; i < count; i++) {
amdsmi_processor_handle processor_handle;
err = get_processor_handle_from_id(gpu_index_list[i], &processor_handle);
if (err != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Fail to get process GPUs processor handle information: " << err);
return status;
}
amdsmi_xgmi_link_status_t link_status;
err = amdsmi_get_gpu_xgmi_link_status(processor_handle, &link_status);
if (err != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Fail to get process GPUs xgmi link information: " << err);
}
results->gpus[i].gpu_index = gpu_index_list[i];
results->gpus[i].num_of_links = link_status.total_links;
for (uint32_t n = 0; n < link_status.total_links; n++) {
results->gpus[i].link_states[n] = static_cast<rdc_link_state_t>(link_status.status[n]);
}
results->gpus[i].link_types = RDC_IOLINK_TYPE_XGMI;
}
return status;
}
@@ -1027,27 +1027,46 @@ rdc_status_t RdcStandaloneHandler::rdc_device_topology_get(uint32_t gpu_index,
if (err_status != RDC_ST_OK) return err_status;
::rdc::Topology Topology = reply.toppology();
results->num_of_gpus= Topology.num_of_gpus();
results->numa_node= Topology.numa_node();
results->num_of_gpus = Topology.num_of_gpus();
results->numa_node = Topology.numa_node();
for (uint32_t i = 0; i < Topology.num_of_gpus(); ++i) {
::rdc::TopologyLinkInfo linkinfo = Topology.link_infos(i);
results->link_infos[i].gpu_index=linkinfo.gpu_index();
results->link_infos[i].weight=linkinfo.weight();
results->link_infos[i].min_bandwidth=linkinfo.min_bandwidth();
results->link_infos[i].max_bandwidth=linkinfo.max_bandwidth();
results->link_infos[i].hops=linkinfo.hops();
results->link_infos[i].link_type=static_cast<rdc_topology_link_type_t>(linkinfo.link_type());
results->link_infos[i].is_p2p_accessible=linkinfo.p2p_accessible();
::rdc::TopologyLinkInfo linkinfo = Topology.link_infos(i);
results->link_infos[i].gpu_index = linkinfo.gpu_index();
results->link_infos[i].weight = linkinfo.weight();
results->link_infos[i].min_bandwidth = linkinfo.min_bandwidth();
results->link_infos[i].max_bandwidth = linkinfo.max_bandwidth();
results->link_infos[i].hops = linkinfo.hops();
results->link_infos[i].link_type = static_cast<rdc_topology_link_type_t>(linkinfo.link_type());
results->link_infos[i].is_p2p_accessible = linkinfo.p2p_accessible();
}
return RDC_ST_OK;
}
rdc_status_t RdcStandaloneHandler::rdc_link_status_get(rdc_link_status_t* results) {
::rdc::UpdateAllFieldsResponse reply;
::grpc::Status status = grpc::Status::OK;
return error_handle(status, reply.status());
::rdc::Empty request;
::rdc::GetLinkStatusResponse reply;
::grpc::ClientContext context;
::grpc::Status status = stub_->GetLinkStatus(&context, request, &reply);
rdc_status_t err_status = error_handle(status, reply.status());
if (err_status != RDC_ST_OK) return err_status;
::rdc::LinkStatus LinkStatus = reply.linkstatus();
results->num_of_gpus = LinkStatus.num_of_gpus();
for (uint32_t i = 0; i < LinkStatus.num_of_gpus(); ++i) {
::rdc::GpuLinkStatus gpulinkstatus = LinkStatus.gpus(i);
results->gpus[i].gpu_index = gpulinkstatus.gpu_index();
results->gpus[i].num_of_links = gpulinkstatus.num_of_links();
results->gpus[i].link_types = static_cast<rdc_topology_link_type_t>(gpulinkstatus.link_types());
for (uint32_t n = 0; n < gpulinkstatus.num_of_links(); n++) {
results->gpus[i].link_states[n] = static_cast<rdc_link_state_t>(gpulinkstatus.link_states(n));
}
}
return RDC_ST_OK;
}
} // namespace rdc
@@ -72,6 +72,7 @@ set(RDCI_SRC_LIST
"${SRC_DIR}/RdciConfigSubSystem.cc"
"${SRC_DIR}/RdciSubSystem.cc"
"${SRC_DIR}/RdciTopologyLinkSubSystem.cc"
"${SRC_DIR}/RdciXgmiLinkStatusSubSystem.cc"
"${SRC_DIR}/rdci.cc")
message("RDCI_SRC_LIST=${RDCI_SRC_LIST}")
set(RDCI_EXE "rdci")
@@ -34,7 +34,7 @@ class RdciTopologyLinkSubSystem : public RdciSubSystem {
private:
void show_help() const;
enum OPERATIONS {
POLICY_UNKNOWN = 0,
TOPOLOGY_UNKNOWN = 0,
TOPOLOGY_INDEX,
} topology_ops_;
uint32_t group_index_;
@@ -0,0 +1,43 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDCI_INCLUDE_RDCIXMGILINKSTATUSSYSTEM_H_
#define RDCI_INCLUDE_RDCIXMGILINKSTATUSSYSTEM_H_
#include <signal.h>
#include <string>
#include "RdciSubSystem.h"
namespace amd {
namespace rdc {
class RdciXgmiLinkStatusSubSystem : public RdciSubSystem {
public:
RdciXgmiLinkStatusSubSystem();
void parse_cmd_opts(int argc, char** argv) override;
void process() override;
private:
void show_help() const;
enum OPERATIONS {
XMGI_LINK_UNKNOWN = 0,
XMGI_LINK_STATUS,
} link_status_ops_;
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCIXMGILINKSTATUSSYSTEM_H_
@@ -28,7 +28,7 @@ THE SOFTWARE.
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
RdciTopologyLinkSubSystem::RdciTopologyLinkSubSystem() {}
RdciTopologyLinkSubSystem::RdciTopologyLinkSubSystem() : topology_ops_(TOPOLOGY_UNKNOWN) {}
void RdciTopologyLinkSubSystem::parse_cmd_opts(int argc, char** argv) {
const int HOST_OPTIONS = 1000;
@@ -109,8 +109,7 @@ void RdciTopologyLinkSubSystem::process() {
<< "------------------+\n";
for (uint32_t i = 0; i < topology.num_of_gpus; i++) {
std::cout << "| To GPU " << i + 1 << "\t\t"
<< "| " << topology_link_type_to_str(RDC_IOLINK_TYPE_XGMI)
<< "\t\t\t|\n";
<< "| " << topology_link_type_to_str(RDC_IOLINK_TYPE_XGMI) << "\t\t\t|\n";
}
std::cout << "+-----------------------+"
<< "-----------------------------"
@@ -118,6 +117,9 @@ void RdciTopologyLinkSubSystem::process() {
}
break;
}
default:
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command");
break;
}
}
} // namespace rdc
@@ -0,0 +1,104 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "RdciXgmiLinkStatusSubSystem.h"
#include <getopt.h>
#include <signal.h>
#include <unistd.h>
#include "common/rdc_utils.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
RdciXgmiLinkStatusSubSystem::RdciXgmiLinkStatusSubSystem() : link_status_ops_(XMGI_LINK_UNKNOWN) {}
void RdciXgmiLinkStatusSubSystem::parse_cmd_opts(int argc, char** argv) {
const int HOST_OPTIONS = 1000;
const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
{"help", optional_argument, nullptr, 'h'},
{"unauth", optional_argument, nullptr, 'u'},
{"xgmi-link-status", optional_argument, nullptr, 'l'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
int opt = 0;
while ((opt = getopt_long(argc, argv, "hul", long_options, &option_index)) != -1) {
{
switch (opt) {
case HOST_OPTIONS:
ip_port_ = optarg;
break;
case 'h':
show_help();
break;
case 'u':
use_auth_ = false;
break;
case 'l':
link_status_ops_ = XMGI_LINK_STATUS;
break;
default:
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
}
}
}
}
void RdciXgmiLinkStatusSubSystem::show_help() const {
std::cout << " link -- Used to link Get the link status the link is up or down.\n\n";
std::cout << "Usage\n";
std::cout << " rdci link [--host <IP/FQDN>:port] [--xgmi-link-status] -l \n";
std::cout << "\nFlags:\n";
show_common_usage();
}
static const char* xgmi_link_status_to_str(rdc_link_state_t state) {
if (state == RDC_LINK_STATE_DISABLED) return "X ";
if (state == RDC_LINK_STATE_DOWN) return "D ";
if (state == RDC_LINK_STATE_UP) return "U ";
return "N/A ";
}
void RdciXgmiLinkStatusSubSystem::process() {
rdc_status_t result = RDC_ST_OK;
switch (link_status_ops_) {
case XMGI_LINK_STATUS: {
rdc_link_status_t link_status;
result = rdc_link_status_get(rdc_handle_, &link_status);
if (result == RDC_ST_OK) {
std::cout << "GPUs:\n";
for (int32_t i = 0; i < link_status.num_of_gpus; i++) {
std::cout << " GPU:" << link_status.gpus[i].gpu_index << "\n";
std::cout << " ";
for (uint32_t n = 0; n < link_status.gpus[i].num_of_links; n++) {
std::cout << xgmi_link_status_to_str(link_status.gpus[i].link_states[n]);
}
std::cout << "\n";
}
}
break;
}
default:
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command");
break;
}
}
} // namespace rdc
} // namespace amd
@@ -35,6 +35,7 @@ THE SOFTWARE.
#include "RdciTopologyLinkSubSystem.h"
#include "RdciPolicySubSystem.h"
#include "RdciStatsSubSystem.h"
#include "RdciXgmiLinkStatusSubSystem.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/rdc_common.h"
@@ -57,7 +58,7 @@ int main(int argc, char** argv) {
const std::string usage_help =
"Usage:\trdci <subsystem>|<options>\n"
"subsystem: \n"
" discovery, dmon, group, fieldgroup, stats, diag, config, policy, health, topo\n"
" discovery, dmon, group, fieldgroup, stats, diag, config, policy, health, topo, link\n"
"options: \n"
" -v(--version) : Print client version information only\n";
@@ -93,6 +94,8 @@ int main(int argc, char** argv) {
subsystem.reset(new amd::rdc::RdciHealthSubSystem());
} else if (subsystem_name == "topo") {
subsystem.reset(new amd::rdc::RdciTopologyLinkSubSystem());
} else if (subsystem_name == "link") {
subsystem.reset(new amd::rdc::RdciXgmiLinkStatusSubSystem());
} else if (subsystem_name == "stats") {
subsystem.reset(new amd::rdc::RdciStatsSubSystem());
} else if (subsystem_name == "policy") {
@@ -152,11 +152,14 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
::grpc::Status UnRegisterPolicy(::grpc::ServerContext* context,
const ::rdc::UnRegisterPolicyRequest* request,
::rdc::UnRegisterPolicyResponse* reply) override;
::grpc::Status GetTopology(::grpc::ServerContext* context,
const ::rdc::GetTopologyRequest* request,
::rdc::GetTopologyResponse* reply) override;
::grpc::Status GetLinkStatus(::grpc::ServerContext* context, const ::rdc::Empty* request,
::rdc::GetLinkStatusResponse* reply) override;
::grpc::Status SetHealth(::grpc::ServerContext* context, const ::rdc::SetHealthRequest* request,
::rdc::SetHealthResponse* reply) override;
@@ -1106,5 +1106,38 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
return ::grpc::Status::OK;
}
::grpc::Status RdcAPIServiceImpl::GetLinkStatus(::grpc::ServerContext* context,
const ::rdc::Empty* request,
::rdc::GetLinkStatusResponse* reply) {
(void)(context);
if (!reply || !request) {
return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
}
rdc_link_status_t link_status_results;
// call RDC link status API
rdc_status_t result = rdc_link_status_get(rdc_handle_, &link_status_results);
reply->set_status(result);
if (result != RDC_ST_OK) {
return ::grpc::Status::OK;
}
::rdc::LinkStatus* linkstatus = reply->mutable_linkstatus();
linkstatus->set_num_of_gpus(link_status_results.num_of_gpus);
for (int32_t i = 0; i < link_status_results.num_of_gpus; ++i) {
::rdc::GpuLinkStatus* gpulinkstatus = linkstatus->add_gpus();
gpulinkstatus->set_gpu_index(link_status_results.gpus[i].gpu_index);
gpulinkstatus->set_num_of_links(link_status_results.gpus[i].num_of_links);
gpulinkstatus->set_link_types(
static_cast<::rdc::GpuLinkStatus_LinkTypes>(link_status_results.gpus[i].link_types));
for (uint32_t n = 0; n < link_status_results.gpus[i].num_of_links; n++) {
gpulinkstatus->add_link_states(static_cast<::rdc::GpuLinkStatus_LinkState>(
link_status_results.gpus[i].link_states[n]));
}
}
return ::grpc::Status::OK;
}
} // namespace rdc
} // namespace amd