From fe868f6763116fa67f95910fadbfc88cb1c8b258 Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Thu, 27 Mar 2025 14:10:11 -0500 Subject: [PATCH] [SWDEV-498711] RDC Partition Implementation (#119) * [SWDEV-498711] RDC Partition Implementation Change-Id: Ibfc3709793770537e4c9d36458f34c6b4f461724 Signed-off-by: adapryor [ROCm/rdc commit: 47692d3ed5ef6442bfdaaa24a3d74fad9d5a0855] --- projects/rdc/include/rdc/rdc.h | 138 ++++++ projects/rdc/include/rdc_lib/RdcEntityCodec.h | 54 +++ projects/rdc/include/rdc_lib/RdcHandler.h | 8 +- projects/rdc/include/rdc_lib/RdcPartition.h | 47 ++ .../include/rdc_lib/impl/RdcEmbeddedHandler.h | 11 +- .../rdc_lib/impl/RdcGroupSettingsImpl.h | 4 +- .../include/rdc_lib/impl/RdcPartitionImpl.h | 44 ++ .../rdc_lib/impl/RdcStandaloneHandler.h | 8 +- projects/rdc/include/rdc_lib/impl/SmiUtils.h | 8 + projects/rdc/protos/rdc.proto | 32 ++ projects/rdc/python_binding/RdcReader.py | 46 +- .../rdc/rdc_libs/bootstrap/CMakeLists.txt | 4 +- .../rdc_libs/bootstrap/src/RdcBootStrap.cc | 21 +- .../rdc_libs/bootstrap/src/RdcEntityCodec.cc | 111 +++++ projects/rdc/rdc_libs/rdc/CMakeLists.txt | 2 + .../rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 20 +- .../rdc_libs/rdc/src/RdcGroupSettingsImpl.cc | 59 ++- .../rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 155 ++++++- .../rdc/rdc_libs/rdc/src/RdcPartitionImpl.cc | 117 +++++ .../rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc | 439 +++++++----------- projects/rdc/rdc_libs/rdc/src/SmiUtils.cc | 136 ++++-- .../rdc_client/src/RdcStandaloneHandler.cc | 36 ++ projects/rdc/rdci/CMakeLists.txt | 3 +- .../rdc/rdci/include/RdciDiscoverySubSystem.h | 2 + projects/rdc/rdci/include/RdciDmonSubSystem.h | 4 + .../rdc/rdci/src/RdciDiscoverySubSystem.cc | 189 +++++++- projects/rdc/rdci/src/RdciDmonSubSystem.cc | 106 ++++- .../rdc/server/include/rdc/rdc_api_service.h | 8 + projects/rdc/server/src/rdc_api_service.cc | 49 +- 29 files changed, 1503 insertions(+), 358 deletions(-) create mode 100644 
projects/rdc/include/rdc_lib/RdcEntityCodec.h create mode 100644 projects/rdc/include/rdc_lib/RdcPartition.h create mode 100644 projects/rdc/include/rdc_lib/impl/RdcPartitionImpl.h create mode 100644 projects/rdc/rdc_libs/bootstrap/src/RdcEntityCodec.cc create mode 100644 projects/rdc/rdc_libs/rdc/src/RdcPartitionImpl.cc diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index d3963e6040..a5625dd61b 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -129,6 +129,11 @@ typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t; */ #define RDC_MAX_NUM_DEVICES 128 +/** + * @brief Max number of partitions + */ +#define RDC_MAX_NUM_PARTITIONS 8 + /** * @brief The max fields in a field group */ @@ -1617,6 +1622,139 @@ rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_i const char* get_rocm_path(const char* search_string); +/** + * @brief The device role + */ +typedef enum { + RDC_DEVICE_ROLE_PHYSICAL, + RDC_DEVICE_ROLE_PARTITION_INSTANCE //!< The partition instance +} rdc_device_role_t; + +/** + * @brief The device type + */ +typedef enum { RDC_DEVICE_TYPE_GPU, RDC_DEVICE_TYPE_CPU } rdc_device_type_t; + +typedef struct { + uint32_t device_index; //!< Physical device index + uint32_t instance_index; //!< Instance or core index + rdc_device_role_t entity_role; //!< Physical device or partition instance + rdc_device_type_t device_type; //!< Type +} rdc_entity_info_t; + +/** + * @brief The function to decode the entity info from entity index + * @details + * | 31 30 29| 28 27 | 21 20 19 ... 12 11 | 10 9 8 7 6 5 4 3 2 1 0 | + * |---------|-------|--------------------|---------------------------| + * | Type | Role | Instance | Device | + * |---------|-------|--------------------|---------------------------| + * the 32 bit entity index is crafted based on above structure, this function + * will decode them into a data structure + * + * @param[in] entity_index The entity index. 
+ * + * @retval rdc_entity_info_t is returned for decode structure + */ + +rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index); + +/** + * @brief The function to encode the entity info to entity index + * @details + * | 31 30 29| 28 27 | 21 20 19 ... 12 11 | 10 9 8 7 6 5 4 3 2 1 0 | + * |---------|-------|--------------------|---------------------------| + * | Type | Role | Instance | Device | + * |---------|-------|--------------------|---------------------------| + * the 32 bit entity index is crafted based on above structure, this function + * will encode them to index + * + * @param[in] info The entity info to encode. + * + * @retval entity_index is returned + */ +uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info); + +// map from amdsmi_accelerator_partition_resource_type_t +typedef enum { + RDC_ACCELERATOR_XCC = 0, + RDC_ACCELERATOR_ENCODER, + RDC_ACCELERATOR_DECODER, + RDC_ACCELERATOR_DMA, + RDC_ACCELERATOR_JPEG, + RDC_ACCELERATOR_RESOURCE_MAX, + RDC_ACCELERATOR_LAST = RDC_ACCELERATOR_RESOURCE_MAX +} rdc_instance_resource_type_t; + +// map from amdsmi_accelerator_partition_resource_profile_t +typedef struct { + rdc_instance_resource_type_t resource_type; + uint32_t partition_resource; // The resources a partition can be used, which may be shared + uint32_t num_partitions_share_resource; // If it is greater than 1, then resource is shared. +} rdc_resource_profile_t; + +/** + * @brief Query the resource allocation for a device/instance + * + * @details The profile contains detail information how resource is allocated. + * + * As an example, MI300X has 8 XCCs and 4 Decoders, in DPX mode, the physical device is + * partitioned to 2 instances, so each instance will have 4 XCC and 2 Decoder and they are + * not shared. + * [XCC, 4, 0], [DECODER, 2, 0] + * + * If it is CPX mode, the physical device is partitioned to 8 instances, and each instance + * have 1 XCC and 2 instances are sharing the same decoder. 
+ * [XCC, 1, 0], [DECODER, 1, 1] + * + * If entity_index is the physical device, it should return all resources of the device: + * [XCC, 8, 0], [DECODER, 4, 0] + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] entity_index The GPU index to query. It can be physical device or instance. + * + * @param[in] resource_type Which resource type to query + * + * @param[out] profile The details how the resource is allocated. + * + * @retval ::RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index, + rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile); + +/** + * @brief Get the number of partitions for the specified GPU index. + * + * @param[in] p_rdc_handle The RDC handler. + * @param[in] index The GPU index to query. + * @param[out] num_partition Pointer to a variable to receive the number of partitions. + * + * @retval ::RDC_ST_OK on success. + */ +rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index, + uint16_t* num_partition); + +/** + * @brief Check if gpuid is partition string + * + * @param[in] s - singular partition string + * @retval bool - if partition string or not + */ +bool rdc_is_partition_string(const char* s); + +/** + * @brief Parse partition id into physical gpu and partition + * + * @param[in] s - singular partition string + * @param[out] physicalGpu - socket id + * @param[out] partition - partition id + * + * @retval bool - success + */ +bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/projects/rdc/include/rdc_lib/RdcEntityCodec.h b/projects/rdc/include/rdc_lib/RdcEntityCodec.h new file mode 100644 index 0000000000..2bc596171f --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcEntityCodec.h @@ -0,0 +1,54 @@ +/* +Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCENTITYCODEC_H_ +#define INCLUDE_RDC_LIB_RDCENTITYCODEC_H_ + +#include "rdc/rdc.h" + +/* + * + * See rdc.h for description of entity_index + * Shifts and masks help get only the bits in question to decode/encode + * + * Ex, RDC_ENTITY_TYPE_SHIFT = 29 helps shift the 29 irrelevant bits, so we're + * only left with the top 3 type bits. + * Then, the corresponding 3 type bits are anded with the RDC_ENTITY_TYPE_MASK = 0x7 + * which = 111 in binary, "copying" the type bits. + * + * + */ +static constexpr uint32_t RDC_ENTITY_TYPE_SHIFT = 29; +static constexpr uint32_t RDC_ENTITY_ROLE_SHIFT = 27; +static constexpr uint32_t RDC_ENTITY_INSTANCE_SHIFT = 11; +static constexpr uint32_t RDC_ENTITY_DEVICE_SHIFT = 0; + +static constexpr uint32_t RDC_ENTITY_TYPE_MASK = 0x7; // 3 bits for type. +static constexpr uint32_t RDC_ENTITY_ROLE_MASK = 0x3; // 2 bits for role. 
+static constexpr uint32_t RDC_ENTITY_INSTANCE_MASK = 0x3FF; // 10 bits for instance. +static constexpr uint32_t RDC_ENTITY_DEVICE_MASK = 0x3FF; // 10 bits for device. + +rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index); +uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info); +bool rdc_is_partition_string(const char* s); +bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition); + +#endif // INCLUDE_RDC_LIB_RDCENTITYCODEC_H_ diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h index e03a7410f1..1e391be7b2 100644 --- a/projects/rdc/include/rdc_lib/RdcHandler.h +++ b/projects/rdc/include/rdc_lib/RdcHandler.h @@ -116,7 +116,7 @@ class RdcHandler { virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) = 0; virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0; - // topology API + // topology API virtual rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) = 0; virtual rdc_status_t rdc_link_status_get(rdc_link_status_t* results) = 0; @@ -131,6 +131,12 @@ class RdcHandler { // Clear the setting virtual rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) = 0; + virtual rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) = 0; + + virtual rdc_status_t rdc_instance_profile_get(uint32_t entity_index, + rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) = 0; + virtual ~RdcHandler() {} }; diff --git a/projects/rdc/include/rdc_lib/RdcPartition.h b/projects/rdc/include/rdc_lib/RdcPartition.h new file mode 100644 index 0000000000..0a555e133d --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcPartition.h @@ -0,0 +1,47 @@ +/* +Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCPARTITION_H_ +#define INCLUDE_RDC_LIB_RDCPARTITION_H_ + +#include + +#include "rdc/rdc.h" + +namespace amd { +namespace rdc { + +class RdcPartition { + public: + virtual rdc_status_t rdc_instance_profile_get_impl(uint32_t entity_index, + rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) = 0; + + virtual rdc_status_t rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) = 0; + + virtual ~RdcPartition() {} +}; +typedef std::shared_ptr RdcPartitionPtr; + +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_RDCPARTITION_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index d82d529d2f..afdc04b293 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -32,6 +32,7 @@ THE SOFTWARE. 
#include "rdc_lib/RdcMetricsUpdater.h" #include "rdc_lib/RdcModuleMgr.h" #include "rdc_lib/RdcNotification.h" +#include "rdc_lib/RdcPartition.h" #include "rdc_lib/RdcPolicy.h" #include "rdc_lib/RdcTopologyLink.h" #include "rdc_lib/RdcWatchTable.h" @@ -121,7 +122,7 @@ class RdcEmbeddedHandler final : public RdcHandler { rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override; rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override; - + rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override; // Set one configure @@ -134,11 +135,18 @@ class RdcEmbeddedHandler final : public RdcHandler { // Clear the setting rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override; + rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) override; + + rdc_status_t rdc_instance_profile_get(uint32_t entity_index, + rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) override; + explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode); ~RdcEmbeddedHandler() final; private: rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges); + RdcPartitionPtr partition_; RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; RdcMetricFetcherPtr metric_fetcher_; @@ -150,7 +158,6 @@ class RdcEmbeddedHandler final : public RdcHandler { RdcTopologyLinkPtr topologylink_; RdcConfigSettingsPtr config_handler_; std::future updater_; - }; } // namespace rdc diff --git a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h index 268d8e0077..3d2533ee56 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcGroupSettingsImpl.h @@ -28,6 +28,7 @@ THE SOFTWARE. 
#include #include "rdc_lib/RdcGroupSettings.h" +#include "rdc_lib/impl/RdcPartitionImpl.h" namespace amd { namespace rdc { @@ -51,7 +52,7 @@ class RdcGroupSettingsImpl : public RdcGroupSettings { rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[], uint32_t* count) override; - RdcGroupSettingsImpl(); + explicit RdcGroupSettingsImpl(const RdcPartitionPtr& partition); private: std::map gpu_group_; @@ -60,6 +61,7 @@ class RdcGroupSettingsImpl : public RdcGroupSettings { uint32_t cur_field_group_id_ = 0; std::mutex group_mutex_; std::mutex field_group_mutex_; + RdcPartitionPtr partition_; }; } // namespace rdc diff --git a/projects/rdc/include/rdc_lib/impl/RdcPartitionImpl.h b/projects/rdc/include/rdc_lib/impl/RdcPartitionImpl.h new file mode 100644 index 0000000000..f4fdb58422 --- /dev/null +++ b/projects/rdc/include/rdc_lib/impl/RdcPartitionImpl.h @@ -0,0 +1,44 @@ +/* +Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_ + +#include + +#include "rdc/rdc.h" +#include "rdc_lib/RdcPartition.h" + +namespace amd { +namespace rdc { +
+class RdcPartitionImpl : public RdcPartition {  // concrete implementation of the RdcPartition interface
+ public:
+  rdc_status_t rdc_instance_profile_get_impl(uint32_t entity_index,
+                                             rdc_instance_resource_type_t resource_type,
+                                             rdc_resource_profile_t* profile) override;
+  rdc_status_t rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) override;
+};  // `override` makes the compiler reject any drift from the base-class pure virtuals
+ +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h index b90f0ac6e8..96d9fdfc5c 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -126,9 +126,15 @@ class RdcStandaloneHandler : public RdcHandler { rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override; rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override; - + rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override; + rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) override; + + rdc_status_t rdc_instance_profile_get(uint32_t entity_index, + rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) override; + explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca, const char* client_cert, const char* client_key); diff --git a/projects/rdc/include/rdc_lib/impl/SmiUtils.h 
b/projects/rdc/include/rdc_lib/impl/SmiUtils.h index 0ea34d44e9..7c8c06da6e 100644 --- a/projects/rdc/include/rdc_lib/impl/SmiUtils.h +++ b/projects/rdc/include/rdc_lib/impl/SmiUtils.h @@ -23,6 +23,8 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ #define INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ +#include + #include "amd_smi/amdsmi.h" #include "rdc/rdc.h" @@ -33,6 +35,12 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi); amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, amdsmi_processor_handle* processor_handle); amdsmi_status_t get_processor_count(uint32_t& all_processor_count); +amdsmi_status_t get_socket_handles(std::vector& sockets); +amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket, + std::vector& processors); +amdsmi_status_t get_kfd_partition_id(amdsmi_processor_handle proc, uint32_t* partition_id); +amdsmi_status_t get_metrics_info(amdsmi_processor_handle proc, amdsmi_gpu_metrics_t* metrics); +amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition); } // namespace rdc } // namespace amd diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index d255a122aa..94f19bcb3b 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -219,6 +219,13 @@ service RdcAPI { // rdc_status_t GetLinkStatus() rpc GetLinkStatus(Empty) returns (GetLinkStatusResponse) {} + + // Get number of partitions + rpc GetNumPartition(GetNumPartitionRequest) returns (GetNumPartitionResponse); + + // Get instance profile of gpu + rpc GetInstanceProfile(GetInstanceProfileRequest) returns (GetInstanceProfileResponse); + } message Empty { @@ -804,3 +811,28 @@ message ClearConfigRequest { message ClearConfigResponse { uint32 status = 1; } + +// Request for getting the number of partitions for a given GPU index. +message GetNumPartitionRequest { + // The GPU index for which to query the number of partitions. + uint32 gpu_index = 1; +} + +// Response for getting the number of partitions. 
+message GetNumPartitionResponse { + // Status of the operation, following RDC_ST_* codes. + uint32 status = 1; + // Number of partitions for the given GPU. + uint32 num_partition = 2; +} + +message GetInstanceProfileRequest { + uint32 entity_index = 1; + uint32 resource_type = 2; +} + +message GetInstanceProfileResponse { + uint32 status = 1; + uint32 partition_resource = 2; + uint32 num_partitions_share_resource = 3; +} diff --git a/projects/rdc/python_binding/RdcReader.py b/projects/rdc/python_binding/RdcReader.py index 0e47d1ff6e..57dc5e9163 100644 --- a/projects/rdc/python_binding/RdcReader.py +++ b/projects/rdc/python_binding/RdcReader.py @@ -1,6 +1,7 @@ import os,time from rdc_bootstrap import * from RdcUtil import RdcUtil +from typing import Dict default_field_ids = [ rdc_field_t.RDC_FI_GPU_MEMORY_USAGE, @@ -26,10 +27,18 @@ default_unit_coverter = { rdc_field_t.RDC_FI_GPU_TEMP: 0.001, # degree }
+class rdc_entity_info_t(Structure):  # ctypes mirror of the C struct rdc_entity_info_t declared in rdc.h
+    _fields_ = [  # ctypes lays fields out in list order, so this must match the C member order exactly
+        ("device_index", c_uint32),    # uint32_t device_index: physical device index
+        ("instance_index", c_uint32),  # uint32_t instance_index: instance or core index
+        ("entity_role", c_uint32),     # rdc_device_role_t: 0 = physical, 1 = partition instance
+        ("device_type", c_uint32),     # rdc_device_type_t: 0 = GPU, 1 = CPU
+    ]
+ class RdcReader: # To run the RDC in embedded mode, set the ip_port = None def __init__(self, ip_port = "localhost:50051", field_ids = default_field_ids, - unit_converter: dict[int, float] = default_unit_coverter, + unit_converter: Dict[int, float] = default_unit_coverter, update_freq = 10000000, max_keep_age = 3600.0 , max_keep_samples = 1000, field_group_name = "rdc_reader_field_group", gpu_group_name = "rdc_reader_gpu_group", gpu_indexes = None, root_ca = "/etc/rdc/client/certs/rdc_cacert.pem", @@ -44,6 +53,11 @@ class RdcReader: self.unit_converter = unit_converter self.rdc_handle = c_void_p() + rdc.rdc_get_entity_index_from_info.argtypes = [rdc_entity_info_t] + rdc.rdc_get_entity_index_from_info.restype = c_uint32 + rdc.rdc_get_info_from_entity_index.argtypes = [c_uint32] + rdc.rdc_get_info_from_entity_index.restype = rdc_entity_info_t + self.is_standalone = True if not ip_port: # 
embedded self.is_standalone = False @@ -69,7 +83,25 @@ class RdcReader: if gpu_indexes == None: self.gpu_indexes = self.rdc_util.get_all_gpu_indexes(self.rdc_handle) else: - self.gpu_indexes = gpu_indexes + self.gpu_indexes = [] + for idx in gpu_indexes: + idx_str = str(idx) + encoded = idx_str.encode("utf-8") + phys_gpu = ctypes.c_uint32() + part_idx = ctypes.c_uint32() + if rdc.rdc_is_partition_string(encoded): + rc = rdc.rdc_parse_partition_string(encoded, ctypes.byref(phys_gpu), ctypes.byref(part_idx)) + if not rc: + raise Exception("Rdc failed to parse partition string") + info = rdc_entity_info_t() + info.device_type = 0 #RDC_DEVICE_TYPE_GPU + info.entity_role = 1 #RDC_DEVICE_ROLE_PARTITION + info.instance_index = part_idx + info.device_index = phys_gpu + entity = rdc.rdc_get_entity_index_from_info(info) + self.gpu_indexes.append(entity) + else: + self.gpu_indexes.append(int(idx_str)) self.gpu_group_id, gpu_group_created = self.rdc_util.create_gpu_group(self.rdc_handle, self.gpu_group_name, self.gpu_indexes) # Create the field group @@ -140,8 +172,16 @@ class RdcReader: def handle_field(self, gpu_index, value): + + info = rdc.rdc_get_info_from_entity_index(gpu_index) + + if info.entity_role == 1: #RDC_DEVICE_ROLE_PARTITION_INSTANCE + gpu_str = f"g{info.device_index}.{info.instance_index}" + else: + gpu_str = str(info.device_index) + field_name = self.rdc_util.field_id_string(value.field_id) - print("%d %d:%d %s:%d" % (value.ts, gpu_index, value.field_id.value, field_name, value.value.l_int)) + print("%d %s:%d %s:%d" % (value.ts, gpu_str, value.field_id.value, field_name, value.value.l_int)) if __name__ == '__main__': diff --git a/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt b/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt index c3e6768d7c..41c06b1cf5 100644 --- a/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt +++ b/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt @@ -12,12 +12,14 @@ set(BOOTSTRAP_LIB_SRC_LIST "${COMMON_DIR}/rdc_fields_supported.cc" 
"${SRC_DIR}/RdcBootStrap.cc" "${SRC_DIR}/RdcLibraryLoader.cc" - "${SRC_DIR}/RdcLogger.cc") + "${SRC_DIR}/RdcLogger.cc" + "${SRC_DIR}/RdcEntityCodec.cc") set(BOOTSTRAP_LIB_INC_LIST "${COMMON_DIR}/rdc_fields_supported.h" "${INC_DIR}/RdcHandler.h" "${INC_DIR}/RdcLibraryLoader.h" "${INC_DIR}/RdcLogger.h" + "${INC_DIR}/RdcEntityCodec.h" "${INC_DIR}/rdc_common.h" "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h") message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}") diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index 63751b87b5..e068fb2784 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -532,8 +532,26 @@ rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* r if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } + return static_cast(p_rdc_handle)->rdc_link_status_get(results); +} + +rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index, + uint16_t* num_partition) { + if (!p_rdc_handle || !num_partition) { + return RDC_ST_INVALID_HANDLER; + } return static_cast(p_rdc_handle) - ->rdc_link_status_get(results); + ->rdc_get_num_partition(index, num_partition); +} + +rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index, + rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) { + if (!p_rdc_handle || !profile) { + return RDC_ST_INVALID_HANDLER; + } + return static_cast(p_rdc_handle) + ->rdc_instance_profile_get(entity_index, resource_type, profile); } const char * get_rocm_path(const char * search_string) { @@ -573,4 +591,3 @@ const char * get_rocm_path(const char * search_string) { return rocm_path.c_str(); } - diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcEntityCodec.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcEntityCodec.cc new file mode 100644 index 0000000000..965cec1ccd --- /dev/null +++ 
b/projects/rdc/rdc_libs/bootstrap/src/RdcEntityCodec.cc @@ -0,0 +1,111 @@ +/* +Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +#include +#include +#include + +#include "common/rdc_utils.h" + +rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index) { + rdc_entity_info_t info; + info.device_type = + (rdc_device_type_t)((entity_index >> RDC_ENTITY_TYPE_SHIFT) & RDC_ENTITY_TYPE_MASK); + info.entity_role = + (rdc_device_role_t)((entity_index >> RDC_ENTITY_ROLE_SHIFT) & RDC_ENTITY_ROLE_MASK); + info.instance_index = (entity_index >> RDC_ENTITY_INSTANCE_SHIFT) & RDC_ENTITY_INSTANCE_MASK; + info.device_index = (entity_index >> RDC_ENTITY_DEVICE_SHIFT) & RDC_ENTITY_DEVICE_MASK; + return info; +} + +uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info) { + uint32_t entity_index = 0; + entity_index |= ((info.device_type & RDC_ENTITY_TYPE_MASK) << RDC_ENTITY_TYPE_SHIFT); + entity_index |= ((info.entity_role & RDC_ENTITY_ROLE_MASK) << RDC_ENTITY_ROLE_SHIFT); + entity_index |= ((info.instance_index & RDC_ENTITY_INSTANCE_MASK) << RDC_ENTITY_INSTANCE_SHIFT); + entity_index |= ((info.device_index & RDC_ENTITY_DEVICE_MASK) << RDC_ENTITY_DEVICE_SHIFT); + return entity_index; +} +
+// Reads a run of ASCII digits at *p into *value and advances *p past the run.
+// Fails on an empty run or as soon as the value reaches `limit`, so over-long
+// digit strings are rejected without overflowing. (std::stoi throws
+// std::out_of_range on such input, which must not escape these C API calls.)
+static bool rdc_read_bounded_number(const char** p, uint32_t limit, uint32_t* value) {
+  const char* cur = *p;
+  uint32_t parsed = 0;
+  if (*cur < '0' || *cur > '9') {
+    return false;
+  }
+  for (; *cur >= '0' && *cur <= '9'; ++cur) {
+    parsed = parsed * 10 + static_cast<uint32_t>(*cur - '0');
+    if (parsed >= limit) {
+      return false;
+    }
+  }
+  *p = cur;
+  *value = parsed;
+  return true;
+}
+
+// Decodes "g<gpu>.<partition>" (for example "g0.3") into its two indices.
+// The GPU index must be below RDC_MAX_NUM_DEVICES and the partition index
+// below RDC_MAX_NUM_PARTITIONS; nothing may follow the partition digits.
+static bool rdc_decode_partition_string(const char* s, uint32_t* gpu, uint32_t* part) {
+  if (!s || s[0] != 'g') {
+    return false;
+  }
+  const char* cur = s + 1;
+  if (!rdc_read_bounded_number(&cur, RDC_MAX_NUM_DEVICES, gpu) || *cur != '.') {
+    return false;
+  }
+  ++cur;  // step over the '.' separator
+  return rdc_read_bounded_number(&cur, RDC_MAX_NUM_PARTITIONS, part) && *cur == '\0';
+}
+
+// Returns true when s is a well-formed, in-range partition string.
+bool rdc_is_partition_string(const char* s) {
+  uint32_t gpu = 0;
+  uint32_t part = 0;
+  return rdc_decode_partition_string(s, &gpu, &part);
+}
+
+// Splits a partition string into its physical GPU and partition indices.
+// Returns false and leaves both outputs untouched when s is not a valid
+// partition string or when either output pointer is null.
+bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition) {
+  if (!physicalGpu || !partition) {
+    return false;
+  }
+
+  uint32_t gpu = 0;
+  uint32_t part = 0;
+  if (!rdc_decode_partition_string(s, &gpu, &part)) {
+    return false;
+  }
+  *physicalGpu = gpu;
+  *partition = part;
+  return true;
+}
diff --git a/projects/rdc/rdc_libs/rdc/CMakeLists.txt b/projects/rdc/rdc_libs/rdc/CMakeLists.txt index 2210950c7d..920e683ad2 100644 --- a/projects/rdc/rdc_libs/rdc/CMakeLists.txt +++ b/projects/rdc/rdc_libs/rdc/CMakeLists.txt @@ -28,6 +28,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/RdcConfigSettingsImpl.cc" "${SRC_DIR}/RdcTelemetryModule.cc" "${SRC_DIR}/RdcWatchTableImpl.cc" + "${SRC_DIR}/RdcPartitionImpl.cc" "${SRC_DIR}/SmiUtils.cc") # TODO: remove all headers? Will just dir be ok after install? @@ -60,6 +61,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${INC_DIR}/impl/RdcSmiLib.h" "${INC_DIR}/impl/RdcTelemetryModule.h" "${INC_DIR}/impl/RdcWatchTableImpl.h" + "${INC_DIR}/impl/RdcPartitionImpl.h" "${INC_DIR}/impl/SmiUtils.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 559ad161f5..c9c9068ac1 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -36,6 +36,7 @@ THE SOFTWARE. 
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" #include "rdc_lib/impl/RdcModuleMgrImpl.h" #include "rdc_lib/impl/RdcNotificationImpl.h" +#include "rdc_lib/impl/RdcPartitionImpl.h" #include "rdc_lib/impl/RdcPolicyImpl.h" #include "rdc_lib/impl/RdcTopologyLinkImpl.h" #include "rdc_lib/impl/RdcWatchTableImpl.h" @@ -76,7 +77,8 @@ namespace rdc { const uint32_t METIC_UPDATE_FREQUENCY = 1000; // 1000 microseconds by default RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode) - : group_settings_(new RdcGroupSettingsImpl()), + : partition_(new RdcPartitionImpl()), + group_settings_(new RdcGroupSettingsImpl(partition_)), cache_mgr_(new RdcCacheManagerImpl()), metric_fetcher_(new RdcMetricFetcherImpl()), rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)), @@ -261,9 +263,14 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, uin if (status != RDC_ST_OK) { return status; } + + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); + + uint32_t physical_gpu = info.device_index; + bool is_gpu_exist = false; for (uint32_t i = 0; i < count; i++) { - if (gpu_index_list[i] == gpu_index) { + if (gpu_index_list[i] == physical_gpu) { is_gpu_exist = true; break; } @@ -527,5 +534,14 @@ rdc_status_t RdcEmbeddedHandler::rdc_config_clear(rdc_gpu_group_t group_id) { return config_handler_->rdc_config_clear(group_id); } +rdc_status_t RdcEmbeddedHandler::rdc_get_num_partition(uint32_t index, uint16_t* num_partition) { + return partition_->rdc_get_num_partition_impl(index, num_partition); +} + +rdc_status_t RdcEmbeddedHandler::rdc_instance_profile_get( + uint32_t entity_index, rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) { + return partition_->rdc_instance_profile_get_impl(entity_index, resource_type, profile); +} } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc index 040580c2e7..42fc2b58fb 
100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcGroupSettingsImpl.cc @@ -23,13 +23,17 @@ THE SOFTWARE. #include +#include "amd_smi/amdsmi.h" #include "rdc_lib/RdcLogger.h" +#include "rdc_lib/impl/RdcPartitionImpl.h" +#include "rdc_lib/impl/SmiUtils.h" #include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { -RdcGroupSettingsImpl::RdcGroupSettingsImpl() { +RdcGroupSettingsImpl::RdcGroupSettingsImpl(const RdcPartitionPtr& partition) + : partition_(partition) { // Add the default job stats fields rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK, RDC_FI_GPU_UTIL, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX, @@ -67,23 +71,50 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_g rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) { std::lock_guard guard(group_mutex_); auto ite = gpu_group_.find(groupId); - if (ite != gpu_group_.end()) { - // Check whether the index already exists - for (uint32_t i = 0; i < ite->second.count; i++) { - if (ite->second.entity_ids[i] == gpu_index) { - RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId - << " as it is already exists"); + if (ite == gpu_group_.end()) { + return RDC_ST_NOT_FOUND; + } + + rdc_entity_info_t entity_info = rdc_get_info_from_entity_index(gpu_index); + + uint16_t num_partitions = 0; + rdc_status_t status = + partition_->rdc_get_num_partition_impl(entity_info.device_index, &num_partitions); + if (status != RDC_ST_OK) { + return status; + } + + if (num_partitions != UINT16_MAX && num_partitions > 1) { + if (entity_info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + if (entity_info.instance_index >= num_partitions) { + RDC_LOG(RDC_INFO, "Invalid partition instance: GPU " + << entity_info.device_index << " supports " << num_partitions + << " partitions, but instance index is " + << entity_info.instance_index); return 
RDC_ST_BAD_PARAMETER; } } - if (ite->second.count < RDC_GROUP_MAX_ENTITIES) { - ite->second.entity_ids[ite->second.count] = gpu_index; - ite->second.count++; - } else { - return RDC_ST_MAX_LIMIT; - } } else { - return RDC_ST_NOT_FOUND; + if (entity_info.entity_role != RDC_DEVICE_ROLE_PHYSICAL) { + RDC_LOG(RDC_INFO, "GPU " << entity_info.device_index + << " is not partitionable, but a partition instance was provided."); + return RDC_ST_BAD_PARAMETER; + } + } + + // Check whether the index already exists + for (uint32_t i = 0; i < ite->second.count; i++) { + if (ite->second.entity_ids[i] == gpu_index) { + RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId + << " as it is already exists"); + return RDC_ST_BAD_PARAMETER; + } + } + if (ite->second.count < RDC_GROUP_MAX_ENTITIES) { + ite->second.entity_ids[ite->second.count] = gpu_index; + ite->second.count++; + } else { + return RDC_ST_MAX_LIMIT; } return RDC_ST_OK; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 056b25e0b0..84080cf042 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -471,9 +471,18 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field amdsmi_processor_handle processor_handle = {}; - amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle); + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); + + amdsmi_status_t ret = get_processor_handle_from_id(info.device_index, &processor_handle); if (ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_index << " error: " << ret); + std::string info_str; + if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + info_str = + "g" + std::to_string(info.device_index) + "." 
+ std::to_string(info.instance_index); + } else { + info_str = std::to_string(info.device_index); + } + RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << info_str << " error: " << ret); return Smi2RdcError(ret); } @@ -486,6 +495,138 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field value->field_id = field_id; value->status = AMDSMI_STATUS_NOT_SUPPORTED; + if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + uint16_t num_partitions = 0; + amdsmi_status_t st = get_num_partition(info.device_index, &num_partitions); + if (st != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get partition info for GPU " << info.device_index); + return RDC_ST_UNKNOWN_ERROR; + } + + amdsmi_processor_handle processor_handle = {}; + amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Cannot get processor handle for partition " << info.instance_index); + return Smi2RdcError(ret); + } + + amdsmi_gpu_metrics_t gpu_metrics = {}; + ret = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get GPU metrics info for partition " << info.instance_index); + return Smi2RdcError(ret); + } + + switch (field_id) { + case RDC_FI_GPU_CLOCK: { + const uint16_t* clock_array = gpu_metrics.current_gfxclks; + std::vector valid_clocks; + valid_clocks.reserve(8); + + for (uint32_t i = 0; i < 8; i++) { + uint16_t clk = clock_array[i]; + if (clk != 0 && clk != 0xFFFF) { + valid_clocks.push_back(clk); + } + } + + uint32_t vc = static_cast(valid_clocks.size()); + uint32_t pCount = static_cast(num_partitions); + uint32_t partIdx = info.instance_index; + + if (valid_clocks.empty() || vc < num_partitions) { + RDC_LOG(RDC_ERROR, "No valid clocks, or less than total partitions"); + return RDC_ST_NO_DATA; + } + + if (vc == num_partitions) { + value->value.l_int = 
clock_array[info.instance_index] * 1000000; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + + uint32_t chunk_size = vc / pCount; + uint32_t start_idx = partIdx * chunk_size; + uint32_t end_idx = start_idx + chunk_size; + + // Average partition clocks + uint64_t sum = 0; + for (uint32_t i = start_idx; i < end_idx; i++) { + sum += valid_clocks[i]; + } + uint32_t count = end_idx - start_idx; + if (count == 0) { + return RDC_ST_NO_DATA; + } + uint64_t avg_clock = sum / count; + + value->value.l_int = avg_clock * 1000000; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + + case RDC_FI_GPU_UTIL: { + uint32_t p = info.instance_index; + if (p >= AMDSMI_MAX_NUM_XCP) { + return RDC_ST_NO_DATA; + } + const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p]; + + uint64_t sum = 0; + uint32_t count = 0; + for (uint32_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + uint32_t busy = xcp.gfx_busy_inst[i]; + if (busy != UINT32_MAX) { + sum += busy; + count++; + } + } + if (count == 0) { + return RDC_ST_NO_DATA; + } + uint64_t avg_busy = sum / count; + value->value.l_int = avg_busy; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + + case RDC_FI_GPU_MM_DEC_UTIL: { + uint32_t p = info.instance_index; + if (p >= AMDSMI_MAX_NUM_XCP) { + return RDC_ST_NO_DATA; + } + const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p]; + + uint64_t sum = 0; + uint32_t count = 0; + for (uint32_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) { + uint16_t vcn = xcp.vcn_busy[i]; + if (vcn != UINT16_MAX) { + sum += vcn; + count++; + } + } + if (count == 0) { + return RDC_ST_NO_DATA; + } + uint64_t avg_decode = sum / count; + value->value.l_int = avg_decode; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + + default: + // All other fields => N/A for partition + RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id) + << " not supported => NO_DATA."); + 
return RDC_ST_NO_DATA; + } + } // end if partition + auto read_smi_counter = [&](void) { RdcFieldKey f_key(gpu_index, field_id); smi_data = get_smi_data(f_key); @@ -600,12 +741,11 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field break; } case RDC_FI_GPU_COUNT: { - uint32_t processor_count = 0; - // amdsmi is initialized in AMDSMI_INIT_AMD_GPUS mode -> returned sockets are GPUs - value->status = get_processor_count(processor_count); + uint32_t socket_count = 0; + value->status = amdsmi_get_socket_handles(&socket_count, nullptr); value->type = INTEGER; if (value->status == AMDSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(processor_count); + value->value.l_int = static_cast(socket_count); } } break; case RDC_FI_POWER_USAGE: { @@ -913,8 +1053,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field value->value.l_int = static_cast(pending_page_num); } } - } else + } else { value->status = Smi2RdcError(ret); + } break; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcPartitionImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcPartitionImpl.cc new file mode 100644 index 0000000000..32ac1b5337 --- /dev/null +++ b/projects/rdc/rdc_libs/rdc/src/RdcPartitionImpl.cc @@ -0,0 +1,117 @@ +/* +Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "rdc_lib/impl/RdcPartitionImpl.h" + +#include +#include + +#include + +#include "amd_smi/amdsmi.h" +#include "rdc/rdc.h" +#include "rdc_lib/impl/SmiUtils.h" + +namespace amd { +namespace rdc { + +rdc_status_t RdcPartitionImpl::rdc_instance_profile_get_impl( + uint32_t entity_index, rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) { + if (profile == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + profile->partition_resource = 0; + profile->num_partitions_share_resource = 0; + + rdc_entity_info_t info = rdc_get_info_from_entity_index(entity_index); + + amdsmi_processor_handle proc_handle; + // Get processor handle of socket + amdsmi_status_t ret = get_processor_handle_from_id(info.device_index, &proc_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + return RDC_ST_UNKNOWN_ERROR; + } + + amdsmi_accelerator_partition_profile_config_t config; + memset(&config, 0, sizeof(config)); + ret = amdsmi_get_gpu_accelerator_partition_profile_config(proc_handle, &config); + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + return RDC_ST_OK; + } else if (ret != AMDSMI_STATUS_SUCCESS) { + return RDC_ST_UNKNOWN_ERROR; + } + + amdsmi_accelerator_partition_profile_t active_profile; + memset(&active_profile, 0, sizeof(active_profile)); + uint32_t num = 0; // This is unused + ret = amdsmi_get_gpu_accelerator_partition_profile(proc_handle, &active_profile, &num); + if (ret != AMDSMI_STATUS_SUCCESS) { + return RDC_ST_UNKNOWN_ERROR; + } + + // If physical device, use profile 0 to get all 
XCC's/Decoders + uint32_t lookup_id = + (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) ? active_profile.profile_index : 0; + + // Map rdc resource type to smi + amdsmi_accelerator_partition_resource_type_t smi_resource; + switch (resource_type) { + case RDC_ACCELERATOR_XCC: + smi_resource = AMDSMI_ACCELERATOR_XCC; + break; + case RDC_ACCELERATOR_DECODER: + smi_resource = AMDSMI_ACCELERATOR_DECODER; + break; + default: + return RDC_ST_NOT_SUPPORTED; + } + + bool found = false; + uint32_t total_resource = 0; + uint32_t resource_share = 0; + for (uint32_t i = 0; i < AMDSMI_MAX_CP_PROFILE_RESOURCES; i++) { + const auto& res = config.resource_profiles[i]; + if (res.profile_index == lookup_id && res.resource_type == smi_resource) { + total_resource = res.partition_resource; + resource_share = res.num_partitions_share_resource; + found = true; + break; + } + } + if (!found) { + return RDC_ST_UNKNOWN_ERROR; + } + profile->partition_resource = total_resource; + profile->num_partitions_share_resource = resource_share; + + return RDC_ST_OK; +} + +rdc_status_t RdcPartitionImpl::rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) { + if (get_num_partition(index, num_partition) != AMDSMI_STATUS_SUCCESS) { + return RDC_ST_UNKNOWN_ERROR; + } + return RDC_ST_OK; +} + +} // namespace rdc +} // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index 8c91d7ba05..fca624e6bc 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -34,8 +34,8 @@ THE SOFTWARE. 
#include "rdc/rdc.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcMetricFetcherImpl.h" -#include "rdc_lib/rdc_common.h" #include "rdc_lib/impl/SmiUtils.h" +#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { @@ -209,6 +209,42 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id, return result; } + // Check for rocprof fields in partitions + rdc_group_info_t ginfo; + result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + if (result != RDC_ST_OK) { + return result; + } + bool groupHasPartition = false; + for (unsigned int i = 0; i < ginfo.count; i++) { + uint32_t entityId = ginfo.entity_ids[i]; + rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId); + if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + groupHasPartition = true; + break; + } + } + + rdc_field_group_info_t field_info; + result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info); + if (result != RDC_ST_OK) { + return result; + } + bool groupHasRocprof = false; + if (result == RDC_ST_OK) { + for (unsigned int i = 0; i < field_info.count; i++) { + rdc_field_t fid = field_info.field_ids[i]; + if (fid >= 800 && fid < 900) { // Rocprof fields in the 800's + groupHasRocprof = true; + break; + } + } + } + + if (groupHasPartition && groupHasRocprof) { + return RDC_ST_NOT_SUPPORTED; + } + // See if any of the fields are notification fields, and // set them up, if so. 
result = notifications_->set_listen_events(fields_in_watch); @@ -381,30 +417,30 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component // set filed ids std::vector field_ids{}; if (components & RDC_HEALTH_WATCH_PCIE) { - field_ids.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT); + field_ids.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT); } if (components & RDC_HEALTH_WATCH_XGMI) { - field_ids.push_back(RDC_HEALTH_XGMI_ERROR); + field_ids.push_back(RDC_HEALTH_XGMI_ERROR); } if (components & RDC_HEALTH_WATCH_MEM) { - field_ids.push_back(RDC_FI_ECC_UNCORRECT_TOTAL); - field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM); - field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM); - field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT); + field_ids.push_back(RDC_FI_ECC_UNCORRECT_TOTAL); + field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM); + field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM); + field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT); } if (components & RDC_HEALTH_WATCH_EEPROM) { - field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID); + field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID); } if (components & RDC_HEALTH_WATCH_THERMAL) { - field_ids.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME); + field_ids.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME); } if (components & RDC_HEALTH_WATCH_POWER) { - field_ids.push_back(RDC_HEALTH_POWER_THROTTLE_TIME); + field_ids.push_back(RDC_HEALTH_POWER_THROTTLE_TIME); } if (0 == field_ids.size()) { @@ -417,8 +453,7 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component field_group_name.c_str(), field_group_id); } -rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id, - unsigned int components) { +rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) { // remove old health for same group_id rdc_health_clear(group_id); @@ -447,13 +482,11 @@ rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id, // get initial values 
rdc_field_value value; result = metric_fetcher_->fetch_smi_field(fields->first, fields->second, &value); - if (result != RDC_ST_OK) - break; + if (result != RDC_ST_OK) break; // set initial values to cache result = cache_mgr_->rdc_health_set(group_id, fields->first, value); - if (result != RDC_ST_OK) - break; + if (result != RDC_ST_OK) break; } // Start to watch the fields and update fields per 1 second. @@ -461,10 +494,8 @@ rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id, return result; } -rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id, - unsigned int *components) { - if (nullptr == components) - return RDC_ST_BAD_PARAMETER; +rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) { + if (nullptr == components) return RDC_ST_BAD_PARAMETER; std::lock_guard guard(watch_mutex_); auto table_iter = health_watch_table_.find(group_id); @@ -478,23 +509,19 @@ rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id, return RDC_ST_OK; } -bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index, - rdc_health_system_t component, - rdc_health_result_t health, - uint32_t err_code, - std::string err_msg, - rdc_health_incidents_t* incident, +bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index, rdc_health_system_t component, + rdc_health_result_t health, uint32_t err_code, + std::string err_msg, rdc_health_incidents_t* incident, rdc_health_response_t* response) { bool result = false; - incident->gpu_index = gpu_index; - incident->component = component; - incident->health = health; + incident->gpu_index = gpu_index; + incident->component = component; + incident->health = health; incident->error.code = err_code; strncpy_with_null(incident->error.msg, err_msg.c_str(), MAX_HEALTH_MSG_LENGTH); - if (incident->health > response->overall_health) - response->overall_health = incident->health; + if (incident->health > response->overall_health) response->overall_health 
= incident->health; response->incidents_count++; if (response->incidents_count >= HEALTH_MAX_ERROR_ITEMS) { RDC_LOG(RDC_INFO, "Health incidents are full!"); @@ -504,24 +531,20 @@ bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index, return (result); } -rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id, - uint32_t gpu_index, - rdc_field_t field, - uint64_t start_timestamp, - rdc_field_value *start_value, - rdc_field_value *end_value) { - if ((nullptr == start_value) && (nullptr == end_value)) - return RDC_ST_BAD_PARAMETER; +rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id, uint32_t gpu_index, + rdc_field_t field, uint64_t start_timestamp, + rdc_field_value* start_value, + rdc_field_value* end_value) { + if ((nullptr == start_value) && (nullptr == end_value)) return RDC_ST_BAD_PARAMETER; rdc_status_t result = RDC_ST_OK; if (nullptr != start_value) { - //get the values of the field at the start_timestamp/end_timestampe - result = cache_mgr_->rdc_health_get_values(group_id, - gpu_index, field, - start_timestamp, 0, - start_value, nullptr); + // get the values of the field at the start_timestamp/end_timestampe + result = cache_mgr_->rdc_health_get_values(group_id, gpu_index, field, start_timestamp, 0, + start_value, nullptr); if (result != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result); + RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field + << " history data. Return: " << result); return result; } } @@ -529,30 +552,25 @@ rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id, // get end values result = metric_fetcher_->fetch_smi_field(gpu_index, field, end_value); if (result != RDC_ST_OK) - RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " current data. 
Return: " << result); + RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field + << " current data. Return: " << result); return result; } -rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id, - uint32_t gpu_index, +rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { - //get field start/end values + // get field start/end values rdc_field_value start = {}, end = {}; uint64_t start_timestamp = static_cast(time(nullptr) - 60) * 1000; - //get the history data last 1 minute - rdc_status_t result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_PCIE_REPLAY_COUNT, - start_timestamp, - &start, - &end); - if (result != RDC_ST_OK) - return result; + // get the history data last 1 minute + rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PCIE_REPLAY_COUNT, + start_timestamp, &start, &end); + if (result != RDC_ST_OK) return result; uint64_t pcie_replay_count = end.value.l_int - start.value.l_int; if (pcie_replay_count > PCIE_MAX_REPLAYS_PERMIN) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(pcie_replay_count); @@ -560,37 +578,26 @@ rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id, err_msg += std::to_string(PCIE_MAX_REPLAYS_PERMIN); err_msg += "."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_PCIE, - RDC_HEALTH_RESULT_WARN, - RDC_FR_PCI_REPLAY_RATE, - err_msg, - incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_PCIE, RDC_HEALTH_RESULT_WARN, + RDC_FR_PCI_REPLAY_RATE, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } return RDC_ST_OK; } -rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id, - uint32_t gpu_index, +rdc_status_t 
RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { - //get field start/end values + // get field start/end values rdc_field_value end = {}; - rdc_status_t result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_XGMI_ERROR, - 0, - nullptr, - &end); - if (result != RDC_ST_OK) - return result; + rdc_status_t result = + get_start_end_values(group_id, gpu_index, RDC_HEALTH_XGMI_ERROR, 0, nullptr, &end); + if (result != RDC_ST_OK) return result; amdsmi_xgmi_status_t status = static_cast(end.value.l_int); if (AMDSMI_XGMI_STATUS_NO_ERRORS != status) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; uint32_t err_code; std::string err_msg = "Detected "; @@ -603,106 +610,68 @@ rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id, } err_msg += "."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_XGMI, - RDC_HEALTH_RESULT_FAIL, - err_code, - err_msg, - incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_XGMI, RDC_HEALTH_RESULT_FAIL, err_code, + err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } return RDC_ST_OK; } -rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, - uint32_t gpu_index, +rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { - //get field start/end values - rdc_field_value start= {}, end = {}; - rdc_status_t result = get_start_end_values(group_id, - gpu_index, - RDC_FI_ECC_UNCORRECT_TOTAL, - 0, - nullptr, - &end); - if (result != RDC_ST_OK) - return result; + // get field start/end values + rdc_field_value start = {}, end = {}; + rdc_status_t result = + get_start_end_values(group_id, gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL, 0, nullptr, &end); + if (result != RDC_ST_OK) return result; uint64_t 
ecc_uncorrectable_count = 0; ecc_uncorrectable_count = end.value.l_int; if (ecc_uncorrectable_count > 0) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(ecc_uncorrectable_count); err_msg += " uncorrectable ECC error(s) since last GPU reset."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_MEM, - RDC_HEALTH_RESULT_FAIL, - RDC_FR_ECC_UNCORRECTABLE_DETECTED, - err_msg, - incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL, + RDC_FR_ECC_UNCORRECTABLE_DETECTED, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } - result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_PENDING_PAGE_NUM, - 0, - nullptr, - &end); - if (result != RDC_ST_OK) - return result; + result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PENDING_PAGE_NUM, 0, nullptr, &end); + if (result != RDC_ST_OK) return result; uint64_t num_pages = end.value.l_int; if (num_pages > 0) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(num_pages); err_msg += " pending retired page(s)."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_MEM, - RDC_HEALTH_RESULT_WARN, - RDC_FR_PENDING_PAGE_RETIREMENTS, - err_msg, - incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_WARN, + RDC_FR_PENDING_PAGE_RETIREMENTS, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } - //get retired page number - result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_RETIRED_PAGE_NUM, - 0, - nullptr, - &end); - if (result != RDC_ST_OK) - return result; + // 
get retired page number + result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_NUM, 0, nullptr, &end); + if (result != RDC_ST_OK) return result; uint64_t retired_page = end.value.l_int; - //get retired page threshold - result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_RETIRED_PAGE_LIMIT, - 0, - nullptr, - &end); - if (result != RDC_ST_OK) - return result; + // get retired page threshold + result = + get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_LIMIT, 0, nullptr, &end); + if (result != RDC_ST_OK) return result; uint32_t retired_page_threshold = end.value.l_int; if (retired_page > retired_page_threshold) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(retired_page); @@ -710,14 +679,9 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, err_msg += std::to_string(retired_page_threshold); err_msg += "."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_MEM, - RDC_HEALTH_RESULT_FAIL, - RDC_FR_RETIRED_PAGES_LIMIT, - err_msg, - incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL, + RDC_FR_RETIRED_PAGES_LIMIT, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; return RDC_ST_OK; @@ -725,31 +689,22 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, if (retired_page > 0) { uint64_t start_timestamp = static_cast(time(nullptr) - 604800) * 1000; - //get retired page number last 1 week - result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_RETIRED_PAGE_NUM, - start_timestamp, - &start, - &end); - if (result != RDC_ST_OK) - return result; + // get retired page number last 1 week + result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_NUM, start_timestamp, + &start, 
&end); + if (result != RDC_ST_OK) return result; retired_page = end.value.l_int - start.value.l_int; if (retired_page > 1) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(retired_page); err_msg += " retired pages more than one in the last week."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_MEM, - RDC_HEALTH_RESULT_FAIL, - RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT, - err_msg, - incident, + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL, + RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } @@ -758,194 +713,150 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, return RDC_ST_OK; } -rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id, - uint32_t gpu_index, +rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { rdc_field_value end = {}; - rdc_status_t result = get_start_end_values(group_id, - gpu_index, - RDC_FI_ECC_UNCORRECT_TOTAL, - 0, - nullptr, - &end); - if (result != RDC_ST_OK && result != RDC_ST_CORRUPTED_EEPROM) - return result; + rdc_status_t result = + get_start_end_values(group_id, gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL, 0, nullptr, &end); + if (result != RDC_ST_OK && result != RDC_ST_CORRUPTED_EEPROM) return result; if (result == RDC_ST_CORRUPTED_EEPROM) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected a corrupt EEPROM since last GPU reset."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_EEPROM, - RDC_HEALTH_RESULT_WARN, - RDC_FR_CORRUPT_EEPROM, - err_msg, - 
incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_EEPROM, RDC_HEALTH_RESULT_WARN, + RDC_FR_CORRUPT_EEPROM, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } return RDC_ST_OK; } -rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id, - uint32_t gpu_index, +rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { - //get field start/end values + // get field start/end values rdc_field_value start = {}, end = {}; uint64_t start_timestamp = static_cast(time(nullptr) - 60) * 1000; - //get the history data last 1 minute - rdc_status_t result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_THERMAL_THROTTLE_TIME, - start_timestamp, - &start, - &end); - if (result != RDC_ST_OK) - return result; + // get the history data last 1 minute + rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_THERMAL_THROTTLE_TIME, + start_timestamp, &start, &end); + if (result != RDC_ST_OK) return result; uint64_t acc_socket_thrm = end.value.l_int - start.value.l_int; if (0 < acc_socket_thrm) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(acc_socket_thrm); err_msg += " clock throttling due to thermal violation in the last minute."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_THERMAL, - RDC_HEALTH_RESULT_WARN, - RDC_FR_CLOCKS_THROTTLE_THERMAL, - err_msg, - incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_THERMAL, RDC_HEALTH_RESULT_WARN, + RDC_FR_CLOCKS_THROTTLE_THERMAL, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } return RDC_ST_OK; } -rdc_status_t RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id, - uint32_t gpu_index, +rdc_status_t 
RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id, uint32_t gpu_index, rdc_health_response_t* response) { - //get field start/end values + // get field start/end values rdc_field_value start = {}, end = {}; uint64_t start_timestamp = static_cast(time(nullptr) - 60) * 1000; - //get the history data last 1 minute - rdc_status_t result = get_start_end_values(group_id, - gpu_index, - RDC_HEALTH_POWER_THROTTLE_TIME, - start_timestamp, - &start, - &end); - if (result != RDC_ST_OK) - return result; + // get the history data last 1 minute + rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_POWER_THROTTLE_TIME, + start_timestamp, &start, &end); + if (result != RDC_ST_OK) return result; uint64_t acc_ppt_pwr = end.value.l_int - start.value.l_int; if (0 < acc_ppt_pwr) { - rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + rdc_health_incidents_t* incident = &response->incidents[response->incidents_count]; std::string err_msg = "Detected "; err_msg += std::to_string(acc_ppt_pwr); err_msg += " Detected clock throttling due to power violation in the last minute."; - //add incident - if (add_health_incident(gpu_index, - RDC_HEALTH_WATCH_POWER, - RDC_HEALTH_RESULT_WARN, - RDC_FR_CLOCKS_THROTTLE_POWER, - err_msg, - incident, - response)) + // add incident + if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_POWER, RDC_HEALTH_RESULT_WARN, + RDC_FR_CLOCKS_THROTTLE_POWER, err_msg, incident, response)) return RDC_ST_MAX_LIMIT; } return RDC_ST_OK; } rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id, - rdc_health_response_t *response) { - if (nullptr == response) - return RDC_ST_BAD_PARAMETER; + rdc_health_response_t* response) { + if (nullptr == response) return RDC_ST_BAD_PARAMETER; unsigned int components = 0; std::vector fields_in_watch; do { //< lock guard for thread safe std::lock_guard guard(watch_mutex_); auto health = health_watch_table_.find(group_id); - if (health == 
health_watch_table_.end()) - return RDC_ST_NOT_FOUND; + if (health == health_watch_table_.end()) return RDC_ST_NOT_FOUND; components = health->second.components; fields_in_watch = health->second.fields; } while (0); rdc_group_info_t ginfo; rdc_status_t result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); - if (result != RDC_ST_OK) - return result; + if (result != RDC_ST_OK) return result; for (auto fields = fields_in_watch.begin(); fields != fields_in_watch.end(); fields++) { // get current values rdc_field_value value; result = metric_fetcher_->fetch_smi_field(fields->first, fields->second, &value); - if (result != RDC_ST_OK) - break; + if (result != RDC_ST_OK) break; // set current values to cache result = cache_mgr_->rdc_update_health_stats(group_id, fields->first, value); - if (result != RDC_ST_OK) - break; + if (result != RDC_ST_OK) break; } - //init response + // init response response->overall_health = RDC_HEALTH_RESULT_PASS; response->incidents_count = 0; for (uint32_t gindex = 0; gindex < ginfo.count; gindex++) { - //PCIe + // PCIe if (components & RDC_HEALTH_WATCH_PCIE) { result = pcie_check(group_id, ginfo.entity_ids[gindex], response); - if (result == RDC_ST_MAX_LIMIT) - return result; + if (result == RDC_ST_MAX_LIMIT) return result; } - //XGMI + // XGMI if (components & RDC_HEALTH_WATCH_XGMI) { result = xgmi_check(group_id, ginfo.entity_ids[gindex], response); - if (result == RDC_ST_MAX_LIMIT) - return result; + if (result == RDC_ST_MAX_LIMIT) return result; } - //Memory + // Memory if (components & RDC_HEALTH_WATCH_MEM) { result = memory_check(group_id, ginfo.entity_ids[gindex], response); - if (result == RDC_ST_MAX_LIMIT) - return result; + if (result == RDC_ST_MAX_LIMIT) return result; } - //EEPROM + // EEPROM if (components & RDC_HEALTH_WATCH_EEPROM) { result = eeprom_check(group_id, ginfo.entity_ids[gindex], response); - if (result == RDC_ST_MAX_LIMIT) - return result; + if (result == RDC_ST_MAX_LIMIT) return result; } - //Thermal + 
// Thermal if (components & RDC_HEALTH_WATCH_THERMAL) { result = thermal_check(group_id, ginfo.entity_ids[gindex], response); - if (result == RDC_ST_MAX_LIMIT) - return result; + if (result == RDC_ST_MAX_LIMIT) return result; } - //Power + // Power if (components & RDC_HEALTH_WATCH_POWER) { result = power_check(group_id, ginfo.entity_ids[gindex], response); - if (result == RDC_ST_MAX_LIMIT) - return result; + if (result == RDC_ST_MAX_LIMIT) return result; } - } //end of for gindex + } // end of for gindex return RDC_ST_OK; } @@ -953,7 +864,7 @@ rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id, rdc_status_t RdcWatchTableImpl::rdc_health_clear(rdc_gpu_group_t group_id) { rdc_field_grp_t field_group_id; - do { //< lock guard for thread safe + do { //< lock guard for thread safe std::lock_guard guard(watch_mutex_); auto health = health_watch_table_.find(group_id); if (health == health_watch_table_.end()) { @@ -1219,8 +1130,8 @@ void RdcWatchTableImpl::debug_status() { for (const auto& p : hite->second.fields) { strstream << "<" << p.first << "," << p.second << "> "; } - RDC_LOG(RDC_DEBUG, - "group id : " << hite->first << " components : " << hite->second.components << " fields : " << strstream.str()); + RDC_LOG(RDC_DEBUG, "group id : " << hite->first << " components : " << hite->second.components + << " fields : " << strstream.str()); } if (fields_to_watch_.size() > 0) { diff --git a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc index 8f85f591a7..27a59f3344 100644 --- a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc +++ b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc @@ -23,6 +23,7 @@ THE SOFTWARE. 
#include "rdc_lib/impl/SmiUtils.h" #include +#include #include #include "amd_smi/amdsmi.h" @@ -79,44 +80,59 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) { amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, amdsmi_processor_handle* processor_handle) { - uint32_t socket_count; - uint32_t processor_count; - auto ret = amdsmi_get_socket_handles(&socket_count, nullptr); + uint32_t socket_count = 0; + amdsmi_status_t ret = amdsmi_get_socket_handles(&socket_count, nullptr); if (ret != AMDSMI_STATUS_SUCCESS) { return ret; } - std::vector sockets(socket_count); - std::vector all_processors{}; - ret = amdsmi_get_socket_handles(&socket_count, sockets.data()); - for (auto& socket : sockets) { - ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr); - if (ret != AMDSMI_STATUS_SUCCESS) { - return ret; - } - std::vector processors(processor_count); - ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data()); - if (ret != AMDSMI_STATUS_SUCCESS) { - return ret; - } - for (auto& processor : processors) { - processor_type_t processor_type = {}; - ret = amdsmi_get_processor_type(processor, &processor_type); - if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) { - RDC_LOG(RDC_ERROR, "Expect AMD_GPU device type!"); - return AMDSMI_STATUS_NOT_SUPPORTED; - } - all_processors.push_back(processor); - } + std::vector sockets(socket_count); + ret = amdsmi_get_socket_handles(&socket_count, sockets.data()); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; } - if (gpu_id >= all_processors.size()) { + std::vector> procs_by_socket; + procs_by_socket.resize(socket_count); + + for (size_t s = 0; s < sockets.size(); s++) { + uint32_t proc_count = 0; + ret = amdsmi_get_processor_handles(sockets[s], &proc_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + std::vector procs(proc_count); + ret = amdsmi_get_processor_handles(sockets[s], &proc_count, procs.data()); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + 
+ for (auto& proc : procs) { + processor_type_t proc_type = {}; + ret = amdsmi_get_processor_type(proc, &proc_type); + if (proc_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + return AMDSMI_STATUS_NOT_SUPPORTED; + } + } + + procs_by_socket[s] = procs; + } + + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_id); + uint32_t socket_index = info.device_index; + uint32_t instance_index = info.instance_index; + + if (socket_index >= procs_by_socket.size()) { return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS; } - // Get processor handle from GPU id - *processor_handle = all_processors[gpu_id]; + const auto& handles = procs_by_socket[socket_index]; + if (instance_index >= handles.size()) { + return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS; + } + *processor_handle = handles[instance_index]; return AMDSMI_STATUS_SUCCESS; } @@ -141,5 +157,69 @@ amdsmi_status_t get_processor_count(uint32_t& all_processor_count) { return AMDSMI_STATUS_SUCCESS; } +amdsmi_status_t get_socket_handles(std::vector& sockets) { + uint32_t socket_count = 0; + amdsmi_status_t ret = amdsmi_get_socket_handles(&socket_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + sockets.resize(socket_count); + + ret = amdsmi_get_socket_handles(&socket_count, sockets.data()); + + return ret; +} + +amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket, + std::vector& processors) { + uint32_t processor_count = 0; + amdsmi_status_t ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + processors.resize(processor_count); + + ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data()); + + return ret; +} + +amdsmi_status_t get_kfd_partition_id(amdsmi_processor_handle proc, uint32_t* partition_id) { + amdsmi_kfd_info_t kfd_info = {}; + amdsmi_status_t ret = amdsmi_get_gpu_kfd_info(proc, &kfd_info); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + *partition_id = 
kfd_info.current_partition_id; + return ret; +} + +amdsmi_status_t get_metrics_info(amdsmi_processor_handle proc, amdsmi_gpu_metrics_t* metrics) { + amdsmi_status_t ret = amdsmi_get_gpu_metrics_info(proc, metrics); + return ret; +} + +amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) { + // Get the processor handle for the physical device. + amdsmi_processor_handle proc_handle; + amdsmi_status_t ret = get_processor_handle_from_id(index, &proc_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + amdsmi_gpu_metrics_t metrics; + memset(&metrics, 0, sizeof(metrics)); + ret = get_metrics_info(proc_handle, &metrics); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + *num_partition = metrics.num_partition; + + return ret; +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index a18e0e2b8e..db75d634e6 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -1075,5 +1075,41 @@ rdc_status_t RdcStandaloneHandler::rdc_link_status_get(rdc_link_status_t* result return RDC_ST_OK; } +rdc_status_t RdcStandaloneHandler::rdc_get_num_partition(uint32_t index, uint16_t* num_partition) { + ::rdc::GetNumPartitionRequest request; + request.set_gpu_index(index); + ::rdc::GetNumPartitionResponse reply; + ::grpc::ClientContext context; + + ::grpc::Status status = stub_->GetNumPartition(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) { + return err_status; + } + *num_partition = reply.num_partition(); + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_instance_profile_get( + uint32_t entity_index, rdc_instance_resource_type_t resource_type, + rdc_resource_profile_t* profile) { + ::rdc::GetInstanceProfileRequest request; + 
request.set_entity_index(entity_index); + request.set_resource_type(static_cast(resource_type)); + + ::rdc::GetInstanceProfileResponse reply; + ::grpc::ClientContext context; + + ::grpc::Status status = stub_->GetInstanceProfile(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) { + return err_status; + } + + profile->partition_resource = reply.partition_resource(); + profile->num_partitions_share_resource = reply.num_partitions_share_resource(); + return RDC_ST_OK; +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdci/CMakeLists.txt b/projects/rdc/rdci/CMakeLists.txt index f7ac5cbe42..551b17eea2 100644 --- a/projects/rdc/rdci/CMakeLists.txt +++ b/projects/rdc/rdci/CMakeLists.txt @@ -56,7 +56,8 @@ set(INC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include") include_directories(${INC_DIR} ${PROJECT_SOURCE_DIR}/include "${GRPC_ROOT}/include" - ${PROJECT_SOURCE_DIR}) + ${PROJECT_SOURCE_DIR} + ${AMD_SMI_INCLUDE_DIR}) set(RDCI_SRC_LIST "${COMMON_DIR}/rdc_fields_supported.cc" diff --git a/projects/rdc/rdci/include/RdciDiscoverySubSystem.h b/projects/rdc/rdci/include/RdciDiscoverySubSystem.h index 212e056591..ea4cca1a90 100644 --- a/projects/rdc/rdci/include/RdciDiscoverySubSystem.h +++ b/projects/rdc/rdci/include/RdciDiscoverySubSystem.h @@ -37,7 +37,9 @@ class RdciDiscoverySubSystem : public RdciSubSystem { bool show_help_; void show_help() const; bool is_list_; + bool is_partition_; void show_attributes(); + void show_attributes_with_partitions(); bool show_version_; void show_version(); }; diff --git a/projects/rdc/rdci/include/RdciDmonSubSystem.h b/projects/rdc/rdci/include/RdciDmonSubSystem.h index 5870c8d9f3..bd439bb2dc 100644 --- a/projects/rdc/rdci/include/RdciDmonSubSystem.h +++ b/projects/rdc/rdci/include/RdciDmonSubSystem.h @@ -43,6 +43,9 @@ class RdciDmonSubSystem : public RdciSubSystem { void show_field_usage() const; void clean_up(); + // Need to resolve gpu indexes after 
process is called + void resolve_gpu_indexes(); + void create_temp_group(); void create_temp_field_group(); @@ -64,6 +67,7 @@ class RdciDmonSubSystem : public RdciSubSystem { std::map options_; std::vector field_ids_; + std::string raw_gpu_indexes_; std::vector gpu_indexes_; bool need_cleanup_; uint64_t latest_time_stamp_; diff --git a/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc b/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc index 48a06b7b8a..02ca20964a 100644 --- a/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc +++ b/projects/rdc/rdci/src/RdciDiscoverySubSystem.cc @@ -24,6 +24,10 @@ THE SOFTWARE. #include #include +#include +#include +#include + #include "rdc/rdc.h" #include "rdc/rdc_private.h" #include "rdc_lib/RdcException.h" @@ -33,22 +37,23 @@ namespace amd { namespace rdc { RdciDiscoverySubSystem::RdciDiscoverySubSystem() - : show_help_(false), - is_list_(false), - show_version_(false) {} + : show_help_(false), is_list_(false), is_partition_(false), show_version_(false) {} void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) { const int HOST_OPTIONS = 1000; const int JSON_OPTIONS = 1001; - const struct option long_options[] = { - {"host", required_argument, nullptr, HOST_OPTIONS}, {"help", optional_argument, nullptr, 'h'}, - {"unauth", optional_argument, nullptr, 'u'}, {"list", optional_argument, nullptr, 'l'}, - {"json", optional_argument, nullptr, JSON_OPTIONS}, {"version", optional_argument, nullptr, 'v'}, {nullptr, 0, nullptr, 0}}; + const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"list", optional_argument, nullptr, 'l'}, + {"json", optional_argument, nullptr, JSON_OPTIONS}, + {"version", optional_argument, nullptr, 'v'}, + {nullptr, 0, nullptr, 0}}; int option_index = 0; int opt = 0; - while ((opt = getopt_long(argc, argv, "hluv", long_options, &option_index)) != -1) { + while ((opt = 
getopt_long(argc, argv, "hliuv", long_options, &option_index)) != -1) { switch (opt) { case HOST_OPTIONS: ip_port_ = optarg; @@ -65,6 +70,9 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) { case 'l': is_list_ = true; break; + case 'i': + is_partition_ = true; + break; case 'v': show_version_ = true; break; @@ -74,9 +82,10 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) { } } - if ((!is_list_ && !show_version_) || (is_list_ && show_version_)) { + int opCount = (is_list_ ? 1 : 0) + (is_partition_ ? 1 : 0) + (show_version_ ? 1 : 0); + if (opCount != 1) { show_help(); - throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify operations"); + throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify exactly one operation"); } } @@ -93,6 +102,8 @@ void RdciDiscoverySubSystem::show_help() const { << "Output using json.\n"; std::cout << " -l --list list GPU discovered" << " on the system\n"; + std::cout << " -i --gpu-instance list GPU discovered" + << " on the system with partitions\n"; std::cout << " -v --version Display version information of the" << " the server and libraries used by the server\n"; } @@ -108,7 +119,7 @@ void RdciDiscoverySubSystem::show_attributes() { if (is_json_output()) { std::cout << "\"gpus\" : [], \"status\": \"ok\""; } else { - std::cout << "No GPUs find on the system\n"; + std::cout << "No GPUs found on the system\n"; } return; } @@ -145,6 +156,145 @@ void RdciDiscoverySubSystem::show_attributes() { } } +void RdciDiscoverySubSystem::show_attributes_with_partitions() { + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + uint32_t count = 0; + rdc_status_t result = rdc_device_get_all(rdc_handle_, gpu_index_list, &count); + if (result != RDC_ST_OK) { + throw RdcException(result, "Fail to get all devices"); + } + + if (count == 0) { + if (is_json_output()) + std::cout << "\"gpus\" : [], \"status\": \"ok\""; + else + std::cout << "No GPUs found on the system\n"; + return; + } + + // Print header. 
+ if (!is_json_output()) { + std::cout << count << " GPUs found." << std::endl; + std::cout << "---------------------------------------------------------------------" + << std::endl; + std::cout << std::setw(12) << std::left << "GPU Index" << std::setw(20) << "Instance Index" + << std::setw(25) << "Device Information" << std::setw(8) << "XCC" << std::setw(8) + << "DECODER" << std::endl; + } else { + std::cout << "\"gpus\" : ["; + } + + // Loop over each GPU. + for (uint32_t i = 0; i < count; i++) { + rdc_device_attributes_t attribute; + result = rdc_device_get_attributes(rdc_handle_, gpu_index_list[i], &attribute); + if (result != RDC_ST_OK) return; + + // Build physical device entity info. + rdc_entity_info_t phys_info; + phys_info.device_index = i; + phys_info.instance_index = 0; + phys_info.entity_role = RDC_DEVICE_ROLE_PHYSICAL; + phys_info.device_type = RDC_DEVICE_TYPE_GPU; + uint32_t phys_entity_index = rdc_get_entity_index_from_info(phys_info); + + rdc_resource_profile_t phys_xcc = {}; + rdc_resource_profile_t phys_decoder_profile = {}; + result = + rdc_instance_profile_get(rdc_handle_, phys_entity_index, RDC_ACCELERATOR_XCC, &phys_xcc); + result = rdc_instance_profile_get(rdc_handle_, phys_entity_index, RDC_ACCELERATOR_DECODER, + &phys_decoder_profile); + + std::string phys_xcc_str = std::to_string(phys_xcc.partition_resource); + std::string phys_decoder_str = std::to_string(phys_decoder_profile.partition_resource); + + if (!is_json_output()) { + std::cout << std::setw(12) << std::left << i << std::setw(20) << "" << std::setw(25) + << attribute.device_name << std::setw(8) << phys_xcc_str << std::setw(8) + << phys_decoder_str << std::endl; + } else { + std::cout << "{\"gpu_index\": \"" << i << "\", " + << "\"device_name\": \"" << attribute.device_name << "\", " + << "\"physical\": {" + << "\"xcc\": \"" << phys_xcc_str << "\", " + << "\"decoder\": \"" << phys_decoder_str << "\" " + << "}"; + } + + uint16_t num_partition = 0; + rdc_status_t result = 
rdc_get_num_partition(rdc_handle_, i, &num_partition); + if (result != RDC_ST_OK) { + return; + } + + // A partitionable device not in partitionable mode will have metrics.num_partition=1 + // Where as, a non-partitionable device will have metrics.num_partition = UINT16_MAX + if (num_partition != UINT16_MAX && num_partition > 1) { + if (is_json_output()) { + std::cout << ", \"partitions\": ["; + } + for (uint32_t pid = 0; pid < num_partition; pid++) { + std::string instance_str = "g" + std::to_string(i) + "." + std::to_string(pid); + + rdc_entity_info_t part_info; + part_info.device_index = i; + part_info.instance_index = pid; + part_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE; + part_info.device_type = RDC_DEVICE_TYPE_GPU; + uint32_t part_entity_index = rdc_get_entity_index_from_info(part_info); + + rdc_resource_profile_t part_xcc = {}; + rdc_resource_profile_t part_decoder = {}; + result = rdc_instance_profile_get(rdc_handle_, part_entity_index, RDC_ACCELERATOR_XCC, + &part_xcc); + result = rdc_instance_profile_get(rdc_handle_, part_entity_index, RDC_ACCELERATOR_DECODER, + &part_decoder); + + std::string part_decoder_str = std::to_string(part_decoder.partition_resource); + std::string part_xcc_str = std::to_string(part_xcc.partition_resource); + std::string starColumn = " "; + if (part_decoder.num_partitions_share_resource > 1) { + starColumn = "*"; + } + + if (!is_json_output()) { + std::cout << std::setw(12) << "" << std::setw(20) << instance_str << std::setw(25) << "" + << std::setw(7) << part_xcc_str << std::setw(1) << starColumn << std::setw(8) + << part_decoder_str << std::endl; + } else { + std::string decoder_shared = + (part_decoder.num_partitions_share_resource > 1) ? 
"true" : "false"; + std::cout << "{\"instance_index\": \"" << instance_str << "\", " + << "\"xcc\": \"" << part_xcc_str << "\", " + << "\"decoder\": \"" << part_decoder_str << "\", " + << "\"decoder_shared\": " << decoder_shared << "}"; + + if (pid != num_partition - 1) { + std::cout << ","; + } else { + std::cout << "]"; + } + } + } + } + + if (is_json_output()) { + if (i != count - 1) + std::cout << "},"; + else + std::cout << "}"; + } + } + + if (!is_json_output()) { + std::cout << "---------------------------------------------------------------------" + << std::endl; + std::cout << "* if the resource is shared" << std::endl; + } else { + std::cout << ']'; + } +} + void RdciDiscoverySubSystem::show_version() { rdc_component_version_t smiv; rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDMSI_COMPONENT, &smiv); @@ -155,18 +305,21 @@ void RdciDiscoverySubSystem::show_version() { mixed_component_version_t rdcdv; uint32_t ret = get_mixed_component_version(rdc_handle_, RDCD_COMPONENT, &rdcdv); if (ret) { - std::cout << "get rdcd version fail"<< std::endl; + std::cout << "get rdcd version fail" << std::endl; return; } - if (is_json_output()) { + if (is_json_output()) { std::cout << "\"version\" : "; std::cout << '{'; - std::cout << "\"rdcd\": " << "\"" << rdcdv.version << "\", "; - std::cout << "\"amdsmi_lib\": " << "\"" << smiv.version << "\""; + std::cout << "\"rdcd\": " + << "\"" << rdcdv.version << "\", "; + std::cout << "\"amdsmi_lib\": " + << "\"" << smiv.version << "\""; std::cout << '}'; } else { - std::cout << "RDCD : " << rdcdv.version << " | " << "AMDSMI Library : " << smiv.version << std::endl; + std::cout << "RDCD : " << rdcdv.version << " | " + << "AMDSMI Library : " << smiv.version << std::endl; } return; @@ -181,6 +334,10 @@ void RdciDiscoverySubSystem::process() { return show_attributes(); } + if (is_partition_) { + return show_attributes_with_partitions(); + } + if (show_version_) { return show_version(); } diff --git 
a/projects/rdc/rdci/src/RdciDmonSubSystem.cc b/projects/rdc/rdci/src/RdciDmonSubSystem.cc index 371d42ae3e..839fc5f808 100644 --- a/projects/rdc/rdci/src/RdciDmonSubSystem.cc +++ b/projects/rdc/rdci/src/RdciDmonSubSystem.cc @@ -26,6 +26,8 @@ THE SOFTWARE. #include #include +#include +#include #include #include #include @@ -62,6 +64,15 @@ void RdciDmonSubSystem::set_terminating(int sig) { } } +std::string entity_to_string(uint32_t entity_index) { + rdc_entity_info_t info = rdc_get_info_from_entity_index(entity_index); + + if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + return "g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index); + } + return std::to_string(info.device_index); +} + void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) { const int HOST_OPTIONS = 1000; const int LIST_ALL_FIELDS_OPT = 1001; @@ -174,15 +185,6 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) { if (gpu_indexes == "") { show_help(); throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPUs or group id"); - } else { - std::vector vec_ids = split_string(gpu_indexes, ','); - for (uint32_t i = 0; i < vec_ids.size(); i++) { - if (!IsNumber(vec_ids[i])) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The GPU index " + vec_ids[i] + " needs to be a number"); - } - gpu_indexes_.push_back(std::stoi(vec_ids[i])); - } } } @@ -207,6 +209,9 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) { if (options_.find(OPTIONS_COUNT) == options_.end()) { options_.insert({OPTIONS_COUNT, std::numeric_limits::max()}); } + + // Store gpu indexes to parse later + raw_gpu_indexes_ = gpu_indexes; } void RdciDmonSubSystem::show_help() const { @@ -272,8 +277,15 @@ void RdciDmonSubSystem::create_temp_group() { for (uint32_t i = 0; i < gpu_indexes_.size(); i++) { result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_indexes_[i]); if (result != RDC_ST_OK) { - throw RdcException(result, - "Fail to add " + 
std::to_string(gpu_indexes_[i]) + " to the dmon group."); + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_indexes_[i]); + std::string info_str; + if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + info_str = + "g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index); + } else { + info_str = std::to_string(info.device_index); + } + throw RdcException(result, "Fail to add " + info_str + " to the dmon group."); } } options_.insert({OPTIONS_GROUP_ID, group_id}); @@ -301,6 +313,73 @@ void RdciDmonSubSystem::create_temp_field_group() { options_.insert({OPTIONS_FIELD_GROUP_ID, group_id}); } +void RdciDmonSubSystem::resolve_gpu_indexes() { + uint32_t device_list[RDC_MAX_NUM_DEVICES]; + uint32_t count = 0; + rdc_status_t res = rdc_device_get_all(rdc_handle_, device_list, &count); + if (res != RDC_ST_OK) { + throw RdcException(res, "Failed to get all devices"); + } + + std::vector vec_ids = split_string(raw_gpu_indexes_, ','); + for (uint32_t i = 0; i < vec_ids.size(); i++) { + if (rdc_is_partition_string(vec_ids[i].c_str())) { + uint32_t logicalPhysicalGpu; + uint32_t partition; + if (!rdc_parse_partition_string(vec_ids[i].c_str(), &logicalPhysicalGpu, &partition)) { + throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid partition format: " + vec_ids[i]); + } + + if (logicalPhysicalGpu >= count) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "GPU " + std::to_string(logicalPhysicalGpu) + " is out of range"); + } + + uint32_t physicalGpu = device_list[logicalPhysicalGpu]; + + uint16_t num_partitions = 0; + rdc_status_t st = rdc_get_num_partition(rdc_handle_, physicalGpu, &num_partitions); + if (st != RDC_ST_OK) { + throw RdcException(st, + "Failed to get partition info for GPU " + std::to_string(physicalGpu)); + } + + if (num_partitions == UINT16_MAX || num_partitions <= 1) { + if (partition != 0) { + throw RdcException(RDC_ST_BAD_PARAMETER, "GPU " + std::to_string(physicalGpu) + + " is not partitioned, so partition " + + 
std::to_string(partition) + " is invalid"); + } + } else { + if (partition >= num_partitions) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "GPU " + std::to_string(physicalGpu) + " supports only " + + std::to_string(num_partitions) + " partitions, partition " + + std::to_string(partition) + " is invalid"); + } + } + + rdc_entity_info_t phys_info; + phys_info.device_index = physicalGpu; + phys_info.instance_index = partition; + phys_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE; + phys_info.device_type = RDC_DEVICE_TYPE_GPU; + uint32_t phys_entity_index = rdc_get_entity_index_from_info(phys_info); + gpu_indexes_.push_back(phys_entity_index); + } else if (IsNumber(vec_ids[i])) { + uint32_t logicalIndex = std::stoi(vec_ids[i]); + if (logicalIndex >= count) { + throw RdcException(RDC_ST_BAD_PARAMETER, + "GPU " + std::to_string(logicalIndex) + " is out of range"); + } + gpu_indexes_.push_back(std::stoi(vec_ids[i])); + } else { + throw RdcException(RDC_ST_BAD_PARAMETER, "The GPU index " + vec_ids[i] + + " needs to be a number or a valid partition"); + } + } +} + void RdciDmonSubSystem::show_field_usage() const { std::cout << "Supported fields Ids:" << std::endl; @@ -430,6 +509,8 @@ void RdciDmonSubSystem::process() { rdc_group_info_t group_info; rdc_field_group_info_t field_info; + resolve_gpu_indexes(); + // Create a temporary group/field if pass as GPU indexes or field ids create_temp_group(); create_temp_field_group(); @@ -516,7 +597,8 @@ void RdciDmonSubSystem::process() { print_and_clr_notif_pq(¬if_pq, show_timpstamps_); for (uint32_t gindex = 0; gindex < group_info.count; gindex++) { - std::cout << group_info.entity_ids[gindex] << "\t"; + std::cout << std::setw(12) << std::left << entity_to_string(group_info.entity_ids[gindex]) + << "\t"; for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { rdc_field_value value; diff --git a/projects/rdc/server/include/rdc/rdc_api_service.h b/projects/rdc/server/include/rdc/rdc_api_service.h index 
974c9c3917..55e11c6ca9 100644 --- a/projects/rdc/server/include/rdc/rdc_api_service.h +++ b/projects/rdc/server/include/rdc/rdc_api_service.h @@ -184,6 +184,14 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { const ::rdc::ClearConfigRequest* request, ::rdc::ClearConfigResponse* reply) override; + ::grpc::Status GetNumPartition(::grpc::ServerContext* context, + const ::rdc::GetNumPartitionRequest* request, + ::rdc::GetNumPartitionResponse* reply) override; + + ::grpc::Status GetInstanceProfile(::grpc::ServerContext* context, + const ::rdc::GetInstanceProfileRequest* request, + ::rdc::GetInstanceProfileResponse* reply) override; + private: bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target); rdc_handle_t rdc_handle_; diff --git a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc index df5d4f4954..466c3e13eb 100644 --- a/projects/rdc/server/src/rdc_api_service.cc +++ b/projects/rdc/server/src/rdc_api_service.cc @@ -1071,7 +1071,7 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData) static_cast<::rdc::TopologyLinkInfo_LinkType>(topology_results.link_infos[i].link_type)); linkinfos->set_p2p_accessible(topology_results.link_infos[i].is_p2p_accessible); } - return ::grpc::Status::OK; + return ::grpc::Status::OK; } ::grpc::Status RdcAPIServiceImpl::SetConfig(::grpc::ServerContext* context, @@ -1140,13 +1140,56 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData) gpulinkstatus->set_link_types( static_cast<::rdc::GpuLinkStatus_LinkTypes>(link_status_results.gpus[i].link_types)); for (uint32_t n = 0; n < link_status_results.gpus[i].num_of_links; n++) { - gpulinkstatus->add_link_states(static_cast<::rdc::GpuLinkStatus_LinkState>( - link_status_results.gpus[i].link_states[n])); + gpulinkstatus->add_link_states( + static_cast<::rdc::GpuLinkStatus_LinkState>(link_status_results.gpus[i].link_states[n])); } } return 
::grpc::Status::OK; } +::grpc::Status RdcAPIServiceImpl::GetNumPartition(::grpc::ServerContext* context, + const ::rdc::GetNumPartitionRequest* request, + ::rdc::GetNumPartitionResponse* reply) { + (void)context; + if (!request || !reply) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty request or reply"); + } + + uint32_t gpu_index = request->gpu_index(); + uint16_t num_partition = 0; + rdc_status_t result = rdc_get_num_partition(rdc_handle_, gpu_index, &num_partition); + reply->set_status(result); + if (result == RDC_ST_OK) { + reply->set_num_partition(num_partition); + } + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::GetInstanceProfile( + ::grpc::ServerContext* context, const ::rdc::GetInstanceProfileRequest* request, + ::rdc::GetInstanceProfileResponse* reply) { + (void)context; + if (!request || !reply) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty request or reply"); + } + + uint32_t entity_index = request->entity_index(); + uint32_t resource_type = request->resource_type(); + rdc_resource_profile_t profile; + memset(&profile, 0, sizeof(profile)); + + // Call the RDC API that (in embedded mode) uses AMD SMI + rdc_status_t result = + rdc_instance_profile_get(rdc_handle_, entity_index, + static_cast(resource_type), &profile); + reply->set_status(result); + if (result == RDC_ST_OK) { + reply->set_partition_resource(profile.partition_resource); + reply->set_num_partitions_share_resource(profile.num_partitions_share_resource); + } + return ::grpc::Status::OK; +} + } // namespace rdc } // namespace amd