From 3bdca8b8b67890d973b01ddfd1b8bf253d17431a Mon Sep 17 00:00:00 2001 From: "Yuan, Perry" Date: Mon, 31 Mar 2025 10:58:36 +0800 Subject: [PATCH] Implement CPU discovery support (#77) * Implement CPU discovery support SWDEV-482949: enable the CPU model name info support to the RDC, rdci command can detect GPU and CPU modules at the same time. It will query the CPU info through the amdsmi interface like below: 1 GPUs found. ----------------------------------------------------------------- GPU Index Device Information 0 AMD Radeon PRO W7800 ================================================================= 1 CPUs found. ----------------------------------------------------------------- CPU Index Device Information 0 AMD Ryzen Threadripper PRO 7995WX 96-Cores ----------------------------------------------------------------- Change-Id: Ibc6533c9a61000cd86c45b1bae14c3eb6788c119 Signed-off-by: Perry Yuan * CMAKE - Add required version for amdsmi Change-Id: I341a89351d196ec66cce215a5d1d3953302fcc66 Signed-off-by: Galantsev, Dmitrii --------- Signed-off-by: Perry Yuan Signed-off-by: Galantsev, Dmitrii Co-authored-by: Galantsev, Dmitrii --- CMakeLists.txt | 7 ++- common/rdc_field.data | 13 ++++ include/rdc/rdc.h | 51 ++++++++++++++++ include/rdc_lib/RdcHandler.h | 4 ++ include/rdc_lib/RdcMetricFetcher.h | 3 +- include/rdc_lib/impl/RdcEmbeddedHandler.h | 5 ++ include/rdc_lib/impl/RdcMetricFetcherImpl.h | 2 + include/rdc_lib/impl/RdcStandaloneHandler.h | 4 ++ protos/rdc.proto | 21 +++++++ rdc_libs/bootstrap/src/RdcBootStrap.cc | 23 ++++++- rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 41 ++++++++++++- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 61 +++++++++++++++++++ .../rdc_client/src/RdcStandaloneHandler.cc | 45 ++++++++++++++ rdci/src/RdciDiscoverySubSystem.cc | 42 +++++++++++++ server/include/rdc/rdc_api_service.h | 7 +++ server/src/rdc_api_service.cc | 41 +++++++++++++ 16 files changed, 365 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
31a90f7ef2..4bfb5a4e72 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,11 @@ option(BUILD_EXAMPLES "Build examples" OFF) # Enable shared libraries for gtest option(BUILD_SHARED_LIBS "Build shared library (.so) or not." ON) +option(ENABLE_ESMI_LIB "Enable AMDSMI ESMI Library" ON) +if(ENABLE_ESMI_LIB) + add_definitions("-DENABLE_ESMI_LIB=1") +endif() + # Enable address sanitizer set(ADDRESS_SANITIZER_DEFAULT OFF) if(DEFINED ENV{ADDRESS_SANITIZER}) @@ -165,7 +170,7 @@ if(BUILD_STANDALONE AND GRPC_ROOT STREQUAL GRPC_ROOT_DEFAULT) Continuing without gRPC install") endif() -find_package(amd_smi +find_package(amd_smi 25.4.0 NAMES amd_smi HINTS ${ROCM_DIR}/lib/cmake CONFIGURE REQUIRED) diff --git a/common/rdc_field.data b/common/rdc_field.data index f4eecac54a..f270df24b1 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -217,3 +217,16 @@ FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", FLD_DESC_ENT(RDC_HEALTH_EEPROM_CONFIG_VALID, "Verify checksum of EEPROM", "EEPROM_CONFIG_VALID", true) FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", true) FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", true) +// CPU-related fields description +FLD_DESC_ENT(RDC_FI_DEV_CPU_UTIL_TOTAL, "CPU total percentage of time in use", "CPU_UTIL_TOTAL", false) +FLD_DESC_ENT(RDC_FI_DEV_CPU_UTIL_USER, "The percentage of time in use by the user", "CPU_UTIL_USER", false) +FLD_DESC_ENT(RDC_FI_DEV_CPU_UTIL_NICE, "The percentage of time in use by low priority (high nice score) programs", "CPU_UTIL_NICE", false) +FLD_DESC_ENT(RDC_FI_DEV_CPU_UTIL_SYS, "The percentage of time in use by the system", "CPU_UTIL_SYS", false) +FLD_DESC_ENT(RDC_FI_DEV_CPU_UTIL_IRQ, "The percentage of time in use by interrupts", "CPU_UTIL_IRQ", false) +FLD_DESC_ENT(RDC_FI_DEV_CPU_TEMP_CURRENT, "Instantaneous temperature (Celsius)", "CPU_TEMP_CURRENT", false) 
+FLD_DESC_ENT(RDC_FI_DEV_CPU_CLOCK_CURRENT, "Instantaneous clock speed (KHz)", "CPU_CLOCK_CURRENT", false)
+FLD_DESC_ENT(RDC_FI_DEV_CPU_POWER_UTIL_CURRENT, "Instantaneous power usage (watts)", "CPU_POWER_UTIL_CURRENT", false)
+FLD_DESC_ENT(RDC_FI_DEV_CPU_POWER_LIMIT, "Instantaneous power limit (watts)", "CPU_POWER_LIMIT", false)
+FLD_DESC_ENT(RDC_FI_DEV_CPU_VENDOR, "The name of the vendor", "CPU_VENDOR", false)
+FLD_DESC_ENT(RDC_FI_DEV_CPU_MODEL, "The name of the model", "CPU_MODEL", false)
+FLD_DESC_ENT(RDC_FI_DEV_CPU_COUNT, "The number of CPU cores", "CPU_COUNT", false)
diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h
index 88441502d2..f9fd81747b 100644
--- a/include/rdc/rdc.h
+++ b/include/rdc/rdc.h
@@ -410,6 +410,21 @@ typedef enum {
   RDC_HEALTH_EEPROM_CONFIG_VALID,    //!< Reads the EEPROM and verifies the checksums
   RDC_HEALTH_POWER_THROTTLE_TIME,    //!< Power throttle status counter
   RDC_HEALTH_THERMAL_THROTTLE_TIME,  //!< Total time in thermal throttle status (microseconds)
+  /**
+   * @brief RDC CPU related fields
+   */
+  RDC_FI_DEV_CPU_UTIL_TOTAL = 10001,  //!< CPU total percentage of time in use
+  RDC_FI_DEV_CPU_UTIL_USER,           //!< The percentage of time in use by the user
+  RDC_FI_DEV_CPU_UTIL_NICE,           //!< The percentage of time in use by low priority (high nice score) programs
+  RDC_FI_DEV_CPU_UTIL_SYS,            //!< The percentage of time in use by the system
+  RDC_FI_DEV_CPU_UTIL_IRQ,            //!< The percentage of time in use by interrupts
+  RDC_FI_DEV_CPU_TEMP_CURRENT,        //!< Instantaneous temperature (Celsius)
+  RDC_FI_DEV_CPU_CLOCK_CURRENT,       //!< Instantaneous clock speed (KHz)
+  RDC_FI_DEV_CPU_POWER_UTIL_CURRENT,  //!< Instantaneous power usage (watts)
+  RDC_FI_DEV_CPU_POWER_LIMIT,         //!< Instantaneous power limit (watts)
+  RDC_FI_DEV_CPU_VENDOR,              //!< The name of the vendor
+  RDC_FI_DEV_CPU_MODEL,               //!< The name of the model
+  RDC_FI_DEV_CPU_COUNT,               //!< CPU count, used by CPU discovery (rdc_field.data: "The number of CPU cores" - TODO confirm unit)
 } rdc_field_t;
 
 // even and odd numbers are used for correctable and uncorrectable errors
@@ -1041,6 +1056,25 @@ rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, uint32_t wait_for_u
 rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
                                 uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count);
 
+/**
+ *  @brief Get indexes corresponding to all the CPU devices on the system.
+ *
+ *  @details Indexes represent the RDC CPU Id corresponding to each CPU on the
+ *  system and are immutable during the lifespan of the engine. The list
+ *  should be queried again if the engine is restarted.
+ *
+ *  @param[in] p_rdc_handle The RDC handler.
+ *
+ *  @param[out] cpu_index_list Array reference to fill CPU indexes present on
+ *  the system.
+ *
+ *  @param[out] count Number of CPUs returned in cpu_index_list.
+ *
+ *  @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_device_get_all_cpu(rdc_handle_t p_rdc_handle,
+                                    uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count);
+
 /**
  *  @brief Gets device attributes corresponding to the gpu_index.
  *
@@ -1058,6 +1092,23 @@ rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
 rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
                                        rdc_device_attributes_t* p_rdc_attr);
 
+/**
+ *  @brief Gets device attributes corresponding to the cpu_index.
+ *
+ *  @details Fetch the attributes, such as device name, of a CPU.
+ *
+ *  @param[in] p_rdc_handle The RDC handler.
+ *
+ *  @param[in] cpu_index CPU index corresponding to which the attributes
+ *  should be fetched.
+ *
+ *  @param[out] p_rdc_attr CPU attribute corresponding to the cpu_index.
+ *
+ *  @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_device_get_cpu_attributes(rdc_handle_t p_rdc_handle, uint32_t cpu_index,
+                                           rdc_device_attributes_t* p_rdc_attr);
+
 /**
  *  @brief Get version information of components used by rdc.
* diff --git a/include/rdc_lib/RdcHandler.h b/include/rdc_lib/RdcHandler.h index 1e391be7b2..708503dd9c 100644 --- a/include/rdc_lib/RdcHandler.h +++ b/include/rdc_lib/RdcHandler.h @@ -43,8 +43,12 @@ class RdcHandler { // Discovery API virtual rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) = 0; + virtual rdc_status_t rdc_device_get_all_cpu(uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) = 0; virtual rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) = 0; + virtual rdc_status_t rdc_device_get_cpu_attributes(uint32_t cpu_index, + rdc_device_attributes_t* p_rdc_attr) = 0; virtual rdc_status_t rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) = 0; diff --git a/include/rdc_lib/RdcMetricFetcher.h b/include/rdc_lib/RdcMetricFetcher.h index b0e7398449..ca7b54af1b 100644 --- a/include/rdc_lib/RdcMetricFetcher.h +++ b/include/rdc_lib/RdcMetricFetcher.h @@ -39,7 +39,8 @@ class RdcMetricFetcher { virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) = 0; - + virtual rdc_status_t fetch_smi_cpu_field(uint32_t cpu_index, rdc_field_t field_id, + rdc_field_value* value) = 0; virtual rdc_status_t bulk_fetch_smi_fields( rdc_gpu_field_t* fields, uint32_t fields_count, std::vector& results) = 0; // NOLINT diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index afdc04b293..1d871672c0 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -53,8 +53,13 @@ class RdcEmbeddedHandler final : public RdcHandler { // Discovery API rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override; + // Discovery API For CPU + rdc_status_t rdc_device_get_all_cpu(uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) override; rdc_status_t 
rdc_device_get_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) override; + rdc_status_t rdc_device_get_cpu_attributes(uint32_t cpu_index, + rdc_device_attributes_t* p_rdc_attr) override; rdc_status_t rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) override; diff --git a/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/include/rdc_lib/impl/RdcMetricFetcherImpl.h index 63745a66df..fd85d3a6a5 100644 --- a/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -71,6 +71,8 @@ class RdcMetricFetcherImpl final : public RdcMetricFetcher { public: rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) override; + rdc_status_t fetch_smi_cpu_field(uint32_t cpu_index, rdc_field_t field_id, + rdc_field_value* value) override; rdc_status_t bulk_fetch_smi_fields( rdc_gpu_field_t* fields, uint32_t fields_count, std::vector& results) override; // NOLINT diff --git a/include/rdc_lib/impl/RdcStandaloneHandler.h b/include/rdc_lib/impl/RdcStandaloneHandler.h index 96d9fdfc5c..c3fd249e65 100644 --- a/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -47,8 +47,12 @@ class RdcStandaloneHandler : public RdcHandler { // Discovery RdcAPI rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override; + rdc_status_t rdc_device_get_all_cpu(uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], + uint32_t* count) override; rdc_status_t rdc_device_get_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) override; + rdc_status_t rdc_device_get_cpu_attributes(uint32_t cpu_index, + rdc_device_attributes_t* p_rdc_attr) override; rdc_status_t rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) override; diff --git a/protos/rdc.proto b/protos/rdc.proto index 94f19bcb3b..f218f267b9 100755 --- a/protos/rdc.proto +++ 
b/protos/rdc.proto
@@ -53,9 +53,16 @@ service RdcAPI {
   // Discovery API
   // rdc_status_t rdc_get_all_devices(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count)
   rpc GetAllDevices(Empty) returns (GetAllDevicesResponse) {}
+
+  // rdc_status_t rdc_device_get_all_cpu(uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count)
+  rpc GetAllCpuDevices(Empty) returns (GetAllCpuDevicesResponse) {}
+
   // rdc_status_t rdc_get_device_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr)
   rpc GetDeviceAttributes(GetDeviceAttributesRequest) returns (GetDeviceAttributesResponse) {}
 
+  // rdc_status_t rdc_device_get_cpu_attributes(uint32_t cpu_index, rdc_device_attributes_t* p_rdc_attr)
+  rpc GetDeviceCpuAttributes(GetCpuDeviceAttributesRequest) returns (GetCpuDeviceAttributesResponse) {}
+
   //rdc_status_t rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv);
   rpc GetComponentVersion(GetComponentVersionRequest) returns (GetComponentVersionResponse) {}
 
@@ -240,6 +247,15 @@ message GetDeviceAttributesRequest {
     uint32 gpu_index = 1;
 }
 
+message GetAllCpuDevicesResponse {
+    uint32 status = 1;
+    repeated uint32 cpus = 2;
+}
+
+message GetCpuDeviceAttributesRequest {
+    uint32 cpu_index = 1;
+}
+
 message DeviceAttributes {
     string device_name = 1;
 }
@@ -249,6 +265,11 @@ message GetDeviceAttributesResponse {
     DeviceAttributes attributes = 2;
 }
 
+message GetCpuDeviceAttributesResponse {
+    uint32 status = 1;
+    DeviceAttributes attributes = 2;
+}
+
 message GetComponentVersionRequest {
     uint32 component_index = 1;
 }
diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc
index e068fb2784..31312d7dcb 100644
--- a/rdc_libs/bootstrap/src/RdcBootStrap.cc
+++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc
@@ -174,6 +174,19 @@ rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
       ->rdc_device_get_all(gpu_index_list, count);
 }
 
+// Public API entry point: forwards CPU discovery to the handler behind p_rdc_handle.
+rdc_status_t rdc_device_get_all_cpu(rdc_handle_t p_rdc_handle,
+                                    uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
+  // Stay silent on bad arguments: a library call must not write to stdout,
+  // where it would be interleaved with the caller's (possibly JSON) output.
+  if (!p_rdc_handle || !count) {
+    return RDC_ST_INVALID_HANDLER;
+  }
+
+  return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
+      ->rdc_device_get_all_cpu(cpu_index_list, count);
+}
+
 rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
                                        rdc_device_attributes_t* p_rdc_attr) {
   if (!p_rdc_handle || !p_rdc_attr) {
@@ -184,8 +197,19 @@ rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_i
       ->rdc_device_get_attributes(gpu_index, p_rdc_attr);
 }
 
+// Public API entry point: forwards the CPU attribute query to the handler behind p_rdc_handle.
+rdc_status_t rdc_device_get_cpu_attributes(rdc_handle_t p_rdc_handle, uint32_t cpu_index,
+                                           rdc_device_attributes_t* p_rdc_attr) {
+  if (!p_rdc_handle || !p_rdc_attr) {
+    return RDC_ST_INVALID_HANDLER;
+  }
+
+  return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
+      ->rdc_device_get_cpu_attributes(cpu_index, p_rdc_attr);
+}
+
 rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component,
                                               rdc_component_version_t* p_rdc_compv) {
   if (!p_rdc_handle || !p_rdc_compv) {
     return RDC_ST_INVALID_HANDLER;
   }
diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
index c9c9068ac1..048d0ce500 100644
--- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
+++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc
@@ -49,7 +49,11 @@ class smi_initializer {
   smi_initializer() {
     // Make sure smi will not be initialized multiple times
     amdsmi_shut_down();
-    amdsmi_status_t ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
+    // Bring up GPU and CPU instances in one SMI session. If that is rejected,
+    // retry GPU-only so the optional CPU feature cannot break GPU monitoring.
+    // NOTE(review): assumes amdsmi_init may be retried directly after a failure - confirm.
+    amdsmi_status_t ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS | AMDSMI_INIT_AMD_CPUS);
+    if (ret != AMDSMI_STATUS_SUCCESS) ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
     if (ret != AMDSMI_STATUS_SUCCESS) {
       throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail");
     }
@@ -193,6 +197,28 @@ rdc_status_t
RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_
   return RDC_ST_OK;
 }
 
+// Discovery API for CPU: fills cpu_index_list with 0..N-1 and stores N in *count.
+rdc_status_t RdcEmbeddedHandler::rdc_device_get_all_cpu(
+    uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
+  if (!cpu_index_list || !count) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  rdc_field_value device_count = {};
+  const rdc_status_t status =
+      metric_fetcher_->fetch_smi_cpu_field(0, RDC_FI_DEV_CPU_COUNT, &device_count);
+  if (status != RDC_ST_OK) return status;
+
+  // The list has room for RDC_MAX_NUM_DEVICES entries; refuse a count that
+  // would write past it (or a negative one) instead of overrunning the buffer.
+  const int64_t found = device_count.value.l_int;
+  if (found < 0 || found > RDC_MAX_NUM_DEVICES) return RDC_ST_BAD_PARAMETER;
+
+  *count = static_cast<uint32_t>(found);
+  for (uint32_t i = 0; i < *count; i++) cpu_index_list[i] = i;
+
+  return RDC_ST_OK;
+}
 rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index,
                                                            rdc_device_attributes_t* p_rdc_attr) {
   if (!p_rdc_attr) {
@@ -204,6 +230,21 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index,
   return status;
 }
 
+rdc_status_t RdcEmbeddedHandler::rdc_device_get_cpu_attributes(uint32_t cpu_index,
+                                                               rdc_device_attributes_t* p_rdc_attr) {
+  if (!p_rdc_attr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  // Zero-initialised so that a failed fetch still yields an empty, terminated name.
+  rdc_field_value device_name = {};
+  const rdc_status_t status =
+      metric_fetcher_->fetch_smi_cpu_field(cpu_index, RDC_FI_DEV_CPU_MODEL, &device_name);
+  strncpy_with_null(p_rdc_attr->device_name, device_name.value.str, RDC_MAX_STR_LENGTH);
+
+  return status;
+}
+
 rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
     rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
   if (!p_rdc_compv) {
diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
index 84080cf042..9969d5997a 100644
--- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
+++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
@@ -1295,5 +1295,80 @@ rdc_status_t RdcMetricFetcherImpl::acquire_smi_handle(RdcFieldKey fk) {
   return ret;
 }
 
+// Fetch one CPU field through amdsmi into *value.
+//
+// Only RDC_FI_DEV_CPU_COUNT and RDC_FI_DEV_CPU_MODEL are implemented. value->status
+// holds the amdsmi status of the query; the return value is that status mapped
+// through Smi2RdcError, so callers can trust RDC_ST_OK to mean *value is filled in.
+rdc_status_t RdcMetricFetcherImpl::fetch_smi_cpu_field(uint32_t cpu_index, rdc_field_t field_id,
+                                                       rdc_field_value* value) {
+  // Validate the arguments before touching the SMI library.
+  if (!value) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  if (!is_field_valid(field_id)) {
+    RDC_LOG(RDC_ERROR, "Fail to fetch CPU field " << field_id << " which is not supported");
+    return RDC_ST_NOT_SUPPORTED;
+  }
+
+  // NOTE(review): kept from the original, which initialises amdsmi for CPUs on
+  // every fetch; confirm whether the embedded handler's session makes this redundant.
+  const amdsmi_status_t init_status = amdsmi_init(AMDSMI_INIT_AMD_CPUS);
+  if (init_status != AMDSMI_STATUS_SUCCESS) {
+    RDC_LOG(RDC_ERROR, "Fail to init amdsmi for CPU");
+    return Smi2RdcError(init_status);
+  }
+
+  value->ts = now();
+  value->field_id = field_id;
+  value->status = AMDSMI_STATUS_NOT_SUPPORTED;
+
+  switch (field_id) {
+    case RDC_FI_DEV_CPU_COUNT: {
+      // System-wide query, so no per-CPU handle is looked up for it.
+      uint32_t processor_count = 0;
+      value->status = get_processor_count(processor_count);
+      value->type = INTEGER;
+      if (value->status == AMDSMI_STATUS_SUCCESS) {
+        value->value.l_int = static_cast<int64_t>(processor_count);
+      }
+      break;
+    }
+    case RDC_FI_DEV_CPU_MODEL: {
+      amdsmi_processor_handle processor_handle = {};
+      const amdsmi_status_t handle_status =
+          get_processor_handle_from_id(cpu_index, &processor_handle);
+      if (handle_status != AMDSMI_STATUS_SUCCESS) {
+        RDC_LOG(RDC_ERROR, "Failed to get processor handle for CPU " << cpu_index
+                                                                      << " error: " << handle_status);
+        value->status = handle_status;
+        break;
+      }
+
+      amdsmi_cpu_info_t cpu_info = {};
+      value->status = amdsmi_get_cpu_model_name(processor_handle, &cpu_info);
+      value->type = STRING;
+      if (value->status == AMDSMI_STATUS_SUCCESS) {
+        // Bounded copy that always leaves value->value.str NUL-terminated, even
+        // if the amdsmi name buffer is larger than (or fills) the RDC one.
+        const size_t capacity = sizeof(value->value.str);
+        const size_t name_size = sizeof(cpu_info.model_name);
+        const size_t copy_len = name_size < capacity ? name_size : capacity - 1;
+        memcpy(value->value.str, cpu_info.model_name, copy_len);
+        value->value.str[copy_len] = '\0';
+      }
+      break;
+    }
+    default: {
+      RDC_LOG(RDC_ERROR, "field_id is not supported: " << field_id);
+      break;
+    }
+  }
+
+  // Report per-field failures instead of unconditionally claiming success.
+  return Smi2RdcError(static_cast<amdsmi_status_t>(value->status));
+}
+
 }  // namespace rdc
 }  // namespace amd
diff --git a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc
index db75d634e6..c2876da3c2 100644
--- a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc
+++ b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc
@@ -234,6 +234,30 @@ rdc_status_t RdcStandaloneHandler::rdc_device_get_all(uint32_t gpu_index_list[RD
   return
RDC_ST_OK;
 }
 
+// Fetch the CPU index list from the remote rdcd over gRPC.
+// Fails with RDC_ST_BAD_PARAMETER on null arguments or an oversized reply.
+rdc_status_t RdcStandaloneHandler::rdc_device_get_all_cpu(
+    uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
+  if (!cpu_index_list || !count) return RDC_ST_BAD_PARAMETER;
+  ::rdc::Empty request;
+  ::rdc::GetAllCpuDevicesResponse reply;
+  ::grpc::ClientContext context;
+
+  ::grpc::Status status = stub_->GetAllCpuDevices(&context, request, &reply);
+  rdc_status_t err_status = error_handle(status, reply.status());
+  if (err_status != RDC_ST_OK) return err_status;
+
+  if (reply.cpus_size() > RDC_MAX_NUM_DEVICES) {  // would overrun cpu_index_list
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  *count = reply.cpus_size();
+  for (uint32_t i = 0; i < *count; i++) {
+    cpu_index_list[i] = reply.cpus(i);
+  }
+
+  return RDC_ST_OK;
+}
 rdc_status_t RdcStandaloneHandler::rdc_device_get_attributes(uint32_t gpu_index,
                                                              rdc_device_attributes_t* p_rdc_attr) {
   if (!p_rdc_attr) {
@@ -254,6 +278,28 @@ rdc_status_t RdcStandaloneHandler::rdc_device_get_attributes(uint32_t gpu_index,
   return RDC_ST_OK;
 }
 
+// Ask the remote rdcd for the attributes of one CPU and copy its name into p_rdc_attr.
+rdc_status_t RdcStandaloneHandler::rdc_device_get_cpu_attributes(uint32_t cpu_index,
+                                                                 rdc_device_attributes_t* p_rdc_attr) {
+  if (!p_rdc_attr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  ::rdc::GetCpuDeviceAttributesRequest request;
+  ::rdc::GetCpuDeviceAttributesResponse reply;
+  ::grpc::ClientContext context;
+
+  request.set_cpu_index(cpu_index);
+
+  ::grpc::Status status = stub_->GetDeviceCpuAttributes(&context, request, &reply);
+  rdc_status_t err_status = error_handle(status, reply.status());
+  if (err_status != RDC_ST_OK) return err_status;
+
+  strncpy_with_null(p_rdc_attr->device_name, reply.attributes().device_name().c_str(),
+                    RDC_MAX_STR_LENGTH);
+
+  return RDC_ST_OK;
+}
+
 rdc_status_t RdcStandaloneHandler::rdc_device_get_component_version(
     rdc_component_t component, rdc_component_version_t* p_rdc_compv) {
   if (!p_rdc_compv) {
diff --git a/rdci/src/RdciDiscoverySubSystem.cc b/rdci/src/RdciDiscoverySubSystem.cc
index 02ca20964a..1d55e9507d 100644
--- a/rdci/src/RdciDiscoverySubSystem.cc
+++
b/rdci/src/RdciDiscoverySubSystem.cc
@@ -148,6 +148,55 @@ void RdciDiscoverySubSystem::show_attributes() {
       std::cout << i << "\t\t" << attribute.device_name << std::endl;
     }
   }
+  // Finish the GPU section before listing CPUs. In JSON mode that means ending
+  // the GPU array here (the bracket printed after this block now closes the CPU
+  // array); the banner below only belongs in the plain-text table.
+  if (is_json_output()) {
+    std::cout << "], ";
+  } else {
+    std::cout << "====================================================="
+              << "============\n";
+  }
+
+  uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES];
+  uint32_t cpu_count = 0;
+  rdc_status_t cpu_result = rdc_device_get_all_cpu(rdc_handle_, cpu_index_list, &cpu_count);
+  if (cpu_result != RDC_ST_OK) {
+    throw RdcException(cpu_result, "Fail to get CPU device information");
+  }
+  if (cpu_count == 0) {
+    if (is_json_output()) {
+      std::cout << "\"cpus\" : [], \"status\": \"ok\"";
+    } else {
+      std::cout << "No CPUs find on the system\n";
+    }
+    return;
+  }
+
+  if (is_json_output()) {
+    std::cout << "\"cpus\" : [";
+  } else {
+    std::cout << cpu_count << " CPUs found.\n";
+    std::cout << "------------------------------------------------"
+              << "-----------------\n";
+    std::cout << "CPU Index\t Device Information\n";
+  }
+  for (uint32_t i = 0; i < cpu_count; i++) {
+    rdc_device_attributes_t cpu_attribute = {};
+    cpu_result = rdc_device_get_cpu_attributes(rdc_handle_, cpu_index_list[i], &cpu_attribute);
+    if (cpu_result != RDC_ST_OK) {
+      return;
+    }
+    if (is_json_output()) {
+      std::cout << "{\"cpu_index\": \"" << cpu_index_list[i] << "\", \"device_name\": \""
+                << cpu_attribute.device_name << "\"}";
+      if (i != cpu_count - 1) {
+        std::cout << ",";
+      }
+    } else {
+      std::cout << cpu_index_list[i] << "\t\t" << cpu_attribute.device_name << std::endl;
+    }
+  }
   if (is_json_output()) {
     std::cout << ']';
   } else {
diff --git a/server/include/rdc/rdc_api_service.h b/server/include/rdc/rdc_api_service.h
index 55e11c6ca9..78d988b038 100644
--- a/server/include/rdc/rdc_api_service.h
+++ b/server/include/rdc/rdc_api_service.h
@@ -43,10 +43,17 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
   ::grpc::Status GetAllDevices(::grpc::ServerContext* context, const ::rdc::Empty* request,
                                ::rdc::GetAllDevicesResponse* reply) override;
 
+  ::grpc::Status GetAllCpuDevices(::grpc::ServerContext* context, const ::rdc::Empty* request,
+                                  ::rdc::GetAllCpuDevicesResponse* reply) override;
+
   ::grpc::Status GetDeviceAttributes(::grpc::ServerContext* context,
                                      const ::rdc::GetDeviceAttributesRequest* request,
                                      ::rdc::GetDeviceAttributesResponse* reply) override;
 
+  ::grpc::Status GetDeviceCpuAttributes(::grpc::ServerContext* context,
+                                        const ::rdc::GetCpuDeviceAttributesRequest* request,
+                                        ::rdc::GetCpuDeviceAttributesResponse* reply) override;
+
   ::grpc::Status GetComponentVersion(::grpc::ServerContext* context,
                                      const ::rdc::GetComponentVersionRequest* request,
                                      ::rdc::GetComponentVersionResponse* reply) override;
diff --git a/server/src/rdc_api_service.cc b/server/src/rdc_api_service.cc
index 466c3e13eb..dd5a0ccf2b 100644
--- a/server/src/rdc_api_service.cc
+++ b/server/src/rdc_api_service.cc
@@ -99,6 +99,32 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() {
   return ::grpc::Status::OK;
 }
 
+// gRPC wrapper for rdc_device_get_all_cpu: the RDC status travels in the reply,
+// so the RPC itself succeeds whenever a reply object is available.
+::grpc::Status RdcAPIServiceImpl::GetAllCpuDevices(::grpc::ServerContext* context,
+                                                   const ::rdc::Empty* request,
+                                                   ::rdc::GetAllCpuDevicesResponse* reply) {
+  (void)(context);
+  (void)(request);
+  if (!reply) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty reply");
+  }
+
+  uint32_t cpu_index_list[RDC_MAX_NUM_DEVICES];
+  uint32_t count = 0;
+  rdc_status_t result = rdc_device_get_all_cpu(rdc_handle_, cpu_index_list, &count);
+  reply->set_status(result);
+  if (result != RDC_ST_OK) {
+    return ::grpc::Status::OK;
+  }
+
+  for (uint32_t i = 0; i < count; i++) {
+    reply->add_cpus(cpu_index_list[i]);
+  }
+
+  return ::grpc::Status::OK;
+}
+
 ::grpc::Status RdcAPIServiceImpl::GetDeviceAttributes(
     ::grpc::ServerContext* context, const ::rdc::GetDeviceAttributesRequest* request,
     ::rdc::GetDeviceAttributesResponse* reply) {
@@ -118,6 +144,29 @@ RdcAPIServiceImpl::~RdcAPIServiceImpl() {
   return ::grpc::Status::OK;
 }
 
+// gRPC wrapper for rdc_device_get_cpu_attributes. The device name is only
+// copied into the reply when the lookup succeeded.
+::grpc::Status RdcAPIServiceImpl::GetDeviceCpuAttributes(
+    ::grpc::ServerContext* context, const ::rdc::GetCpuDeviceAttributesRequest* request,
+    ::rdc::GetCpuDeviceAttributesResponse* reply) {
+  (void)(context);
+  if (!reply || !request) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  const uint32_t cpu_index = request->cpu_index();
+  // Zero-initialised so no uninitialised stack bytes can ever reach the wire.
+  rdc_device_attributes_t attribute = {};
+  const rdc_status_t result = rdc_device_get_cpu_attributes(rdc_handle_, cpu_index, &attribute);
+
+  reply->set_status(result);
+  if (result == RDC_ST_OK) {
+    reply->mutable_attributes()->set_device_name(attribute.device_name);
+  }
+
+  return ::grpc::Status::OK;
+}
+
 ::grpc::Status RdcAPIServiceImpl::GetComponentVersion(
     ::grpc::ServerContext* context, const ::rdc::GetComponentVersionRequest* request,
     ::rdc::GetComponentVersionResponse* reply) {