[SWDEV-498711] RDC Partition Implementation (#119)

* [SWDEV-498711] RDC Partition Implementation

Change-Id: Ibfc3709793770537e4c9d36458f34c6b4f461724
Signed-off-by: adapryor <Adam.pryor@amd.com>

[ROCm/rdc commit: 47692d3ed5]
Dieser Commit ist enthalten in:
Pryor, Adam
2025-03-27 14:10:11 -05:00
committet von GitHub
Ursprung 791fa376e9
Commit fe868f6763
29 geänderte Dateien mit 1503 neuen und 358 gelöschten Zeilen
+138
Datei anzeigen
@@ -129,6 +129,11 @@ typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;
*/
#define RDC_MAX_NUM_DEVICES 128
/**
* @brief Max number of partitions
*/
#define RDC_MAX_NUM_PARTITIONS 8
/**
* @brief The max fields in a field group
*/
@@ -1617,6 +1622,139 @@ rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_i
const char* get_rocm_path(const char* search_string);
/**
 * @brief Role of an entity: a whole physical device, or one partition of it
 *
 * The numeric values are packed into the Role bits of an entity index.
 */
typedef enum {
  RDC_DEVICE_ROLE_PHYSICAL = 0,           //!< The physical device
  RDC_DEVICE_ROLE_PARTITION_INSTANCE = 1  //!< The partition instance
} rdc_device_role_t;
/**
 * @brief Kind of device an entity refers to
 *
 * The numeric values are packed into the Type bits of an entity index.
 */
typedef enum {
  RDC_DEVICE_TYPE_GPU = 0,  //!< GPU device
  RDC_DEVICE_TYPE_CPU = 1   //!< CPU device
} rdc_device_type_t;
/**
 * @brief Decoded form of a packed 32-bit entity index
 *
 * Returned by rdc_get_info_from_entity_index() and accepted by
 * rdc_get_entity_index_from_info(). Both pass the struct by value, so any
 * language binding that mirrors it must declare the fields in this order.
 */
typedef struct {
uint32_t device_index; //!< Physical device index
uint32_t instance_index; //!< Instance or core index
rdc_device_role_t entity_role; //!< Physical device or partition instance
rdc_device_type_t device_type; //!< Device type (GPU or CPU)
} rdc_entity_info_t;
/**
 * @brief Decode a packed 32-bit entity index into an rdc_entity_info_t
 *
 * @details Bit layout, matching the RDC_ENTITY_*_SHIFT / RDC_ENTITY_*_MASK
 * constants in rdc_lib/RdcEntityCodec.h:
 *
 *  | 31 30 29 | 28 27 | 26 ... 21 | 20 19 ... 12 11 |   10   | 9 8 ... 1 0 |
 *  |----------|-------|-----------|-----------------|--------|-------------|
 *  |   Type   | Role  |  unused   |    Instance     | unused |   Device    |
 *
 * Type is 3 bits, Role is 2 bits, Instance and Device are 10 bits each.
 * NOTE(review): this diagram previously drew Instance and Device as 11-bit
 * fields; the masks are 10 bits wide - confirm which width is intended.
 *
 * @param[in] entity_index The packed entity index.
 *
 * @retval rdc_entity_info_t holding the decoded fields
 */
rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index);
/**
 * @brief Encode an rdc_entity_info_t into a packed 32-bit entity index
 *
 * @details Bit layout, matching the RDC_ENTITY_*_SHIFT / RDC_ENTITY_*_MASK
 * constants in rdc_lib/RdcEntityCodec.h:
 *
 *  | 31 30 29 | 28 27 | 26 ... 21 | 20 19 ... 12 11 |   10   | 9 8 ... 1 0 |
 *  |----------|-------|-----------|-----------------|--------|-------------|
 *  |   Type   | Role  |  unused   |    Instance     | unused |   Device    |
 *
 * Each field is masked to its width before being shifted into place, so a
 * value that does not fit is truncated rather than rejected.
 * NOTE(review): this diagram previously drew Instance and Device as 11-bit
 * fields; the masks are 10 bits wide - confirm which width is intended.
 *
 * @param[in] info The entity info to encode.
 *
 * @retval entity_index The packed entity index
 */
uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info);
/**
 * @brief Accelerator resource kinds that can be divided among partitions
 *
 * Maps from amdsmi_accelerator_partition_resource_type_t.
 */
typedef enum {
  RDC_ACCELERATOR_XCC = 0,           //!< XCC
  RDC_ACCELERATOR_ENCODER = 1,       //!< Encoder
  RDC_ACCELERATOR_DECODER = 2,       //!< Decoder
  RDC_ACCELERATOR_DMA = 3,           //!< DMA
  RDC_ACCELERATOR_JPEG = 4,          //!< JPEG
  RDC_ACCELERATOR_RESOURCE_MAX = 5,  //!< Number of resource kinds above
  RDC_ACCELERATOR_LAST = RDC_ACCELERATOR_RESOURCE_MAX  //!< Alias of RESOURCE_MAX
} rdc_instance_resource_type_t;
/**
 * @brief How one resource type is allocated to a device or partition instance
 *
 * Maps from amdsmi_accelerator_partition_resource_profile_t.
 */
typedef struct {
rdc_instance_resource_type_t resource_type; // Which resource this profile describes
uint32_t partition_resource; // Amount of the resource available to a partition; it may be shared
uint32_t num_partitions_share_resource; // If greater than 1, the resource is shared between partitions
} rdc_resource_profile_t;
/**
 * @brief Query the resource allocation for a device or partition instance
 *
 * @details The profile describes how one resource type is allocated.
 *
 * As an example, MI300X has 8 XCCs and 4 decoders. In DPX mode the physical
 * device is partitioned into 2 instances, so each instance has 4 XCCs and
 * 2 decoders, and they are not shared:
 * [XCC, 4, 0], [DECODER, 2, 0]
 *
 * In CPX mode the physical device is partitioned into 8 instances; each
 * instance has 1 XCC, and 2 instances share the same decoder:
 * [XCC, 1, 0], [DECODER, 1, 1]
 *
 * If entity_index is the physical device, all resources of the device are
 * returned:
 * [XCC, 8, 0], [DECODER, 4, 0]
 *
 * NOTE(review): the third element in these examples (0 = not shared,
 * 1 = shared) does not match the comment on
 * rdc_resource_profile_t::num_partitions_share_resource ("greater than 1
 * means shared") - confirm which convention is correct.
 *
 * @param[in] p_rdc_handle The RDC handler.
 *
 * @param[in] entity_index The entity index to query. It can be a physical
 * device or a partition instance.
 *
 * @param[in] resource_type Which resource type to query.
 *
 * @param[out] profile Receives the details of how the resource is allocated.
 *
 * @retval ::RDC_ST_OK is returned upon successful call.
 */
rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index,
rdc_instance_resource_type_t resource_type,
rdc_resource_profile_t* profile);
/**
 * @brief Get the number of partitions for the specified GPU index.
 *
 * @param[in] p_rdc_handle The RDC handler.
 * @param[in] index The GPU index to query.
 * @param[out] num_partition Pointer to a variable that receives the number of
 * partitions. NOTE(review): callers in the library treat UINT16_MAX as a
 * special value - presumably "partitioning not supported"; confirm.
 *
 * @retval ::RDC_ST_OK on success.
 */
rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index,
uint16_t* num_partition);
/**
 * @brief Check whether a GPU id string names a partition
 *
 * @details A partition string has the form "g<gpu>.<partition>", for example
 * "g0.1": the letter 'g', a decimal GPU index below RDC_MAX_NUM_DEVICES, a
 * single '.', and a decimal partition index below RDC_MAX_NUM_PARTITIONS.
 *
 * @param[in] s - singular partition string; NULL or empty yields false
 * @retval bool - true if s is a well-formed, in-range partition string
 */
bool rdc_is_partition_string(const char* s);
/**
 * @brief Parse a partition string into its physical GPU and partition indices
 *
 * @details The string must satisfy rdc_is_partition_string(), i.e. have the
 * form "g<gpu>.<partition>". The outputs are written only on success.
 *
 * @param[in] s - singular partition string
 * @param[out] physicalGpu - receives the number before the '.' (the GPU index)
 * @param[out] partition - receives the number after the '.' (the partition index)
 *
 * @retval bool - true on success, false if s is NULL or not a partition string
 */
bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition);
#ifdef __cplusplus
}
#endif // __cplusplus
@@ -0,0 +1,54 @@
/*
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_RDCENTITYCODEC_H_
#define INCLUDE_RDC_LIB_RDCENTITYCODEC_H_
#include "rdc/rdc.h"
/*
 * Shift and mask constants for the packed 32-bit entity index (see rdc.h).
 *
 * To decode a field, shift the entity index right by the field's SHIFT and
 * AND the result with the field's MASK. To encode, AND the value with the
 * MASK and shift it left by the SHIFT.
 *
 * Example: RDC_ENTITY_TYPE_SHIFT = 29 discards the 29 low bits, leaving the
 * top 3 type bits; ANDing with RDC_ENTITY_TYPE_MASK = 0x7 (binary 111) keeps
 * exactly those 3 bits.
 *
 * Resulting bit ranges: Type 31..29, Role 28..27, Instance 20..11,
 * Device 9..0. Bits 26..21 and bit 10 are not covered by any mask.
 */
static constexpr uint32_t RDC_ENTITY_TYPE_SHIFT = 29;
static constexpr uint32_t RDC_ENTITY_ROLE_SHIFT = 27;
static constexpr uint32_t RDC_ENTITY_INSTANCE_SHIFT = 11;
static constexpr uint32_t RDC_ENTITY_DEVICE_SHIFT = 0;
static constexpr uint32_t RDC_ENTITY_TYPE_MASK = 0x7; // 3 bits for type.
static constexpr uint32_t RDC_ENTITY_ROLE_MASK = 0x3; // 2 bits for role.
static constexpr uint32_t RDC_ENTITY_INSTANCE_MASK = 0x3FF; // 10 bits for instance.
static constexpr uint32_t RDC_ENTITY_DEVICE_MASK = 0x3FF; // 10 bits for device.
rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index);
uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info);
bool rdc_is_partition_string(const char* s);
bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition);
#endif // INCLUDE_RDC_LIB_RDCENTITYCODEC_H_
+7 -1
Datei anzeigen
@@ -116,7 +116,7 @@ class RdcHandler {
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t* response) = 0;
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
// topology API
// topology API
virtual rdc_status_t rdc_device_topology_get(uint32_t gpu_index,
rdc_device_topology_t* results) = 0;
virtual rdc_status_t rdc_link_status_get(rdc_link_status_t* results) = 0;
@@ -131,6 +131,12 @@ class RdcHandler {
// Clear the setting
virtual rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) = 0;
// Partition API
// Report the number of partitions of the GPU at `index` through `num_partition`.
virtual rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) = 0;
// Report through `profile` how `resource_type` is allocated to `entity_index`,
// which may be a physical device or a partition instance.
virtual rdc_status_t rdc_instance_profile_get(uint32_t entity_index,
rdc_instance_resource_type_t resource_type,
rdc_resource_profile_t* profile) = 0;
virtual ~RdcHandler() {}
};
@@ -0,0 +1,47 @@
/*
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_RDCPARTITION_H_
#define INCLUDE_RDC_LIB_RDCPARTITION_H_
#include <memory>
#include "rdc/rdc.h"
namespace amd {
namespace rdc {
/// Abstract interface for GPU partition queries.
class RdcPartition {
 public:
  /// Report through `profile` how `resource_type` is allocated to `entity_index`.
  virtual rdc_status_t rdc_instance_profile_get_impl(uint32_t entity_index,
                                                     rdc_instance_resource_type_t resource_type,
                                                     rdc_resource_profile_t* profile) = 0;

  /// Report the number of partitions of the GPU at `index` through `num_partition`.
  virtual rdc_status_t rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) = 0;

  virtual ~RdcPartition() = default;
};

/// Shared-ownership handle to an RdcPartition implementation.
using RdcPartitionPtr = std::shared_ptr<RdcPartition>;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCPARTITION_H_
@@ -32,6 +32,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcMetricsUpdater.h"
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/RdcPartition.h"
#include "rdc_lib/RdcPolicy.h"
#include "rdc_lib/RdcTopologyLink.h"
#include "rdc_lib/RdcWatchTable.h"
@@ -121,7 +122,7 @@ class RdcEmbeddedHandler final : public RdcHandler {
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override;
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override;
rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override;
// Set one configure
@@ -134,11 +135,18 @@ class RdcEmbeddedHandler final : public RdcHandler {
// Clear the setting
rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override;
// Partition API; both calls are forwarded to partition_.
rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) override;
rdc_status_t rdc_instance_profile_get(uint32_t entity_index,
rdc_instance_resource_type_t resource_type,
rdc_resource_profile_t* profile) override;
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
~RdcEmbeddedHandler() final;
private:
rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges);
RdcPartitionPtr partition_;
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcMetricFetcherPtr metric_fetcher_;
@@ -150,7 +158,6 @@ class RdcEmbeddedHandler final : public RdcHandler {
RdcTopologyLinkPtr topologylink_;
RdcConfigSettingsPtr config_handler_;
std::future<void> updater_;
};
} // namespace rdc
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include <string>
#include "rdc_lib/RdcGroupSettings.h"
#include "rdc_lib/impl/RdcPartitionImpl.h"
namespace amd {
namespace rdc {
@@ -51,7 +52,7 @@ class RdcGroupSettingsImpl : public RdcGroupSettings {
rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
uint32_t* count) override;
RdcGroupSettingsImpl();
explicit RdcGroupSettingsImpl(const RdcPartitionPtr& partition);
private:
std::map<rdc_gpu_group_t, rdc_group_info_t> gpu_group_;
@@ -60,6 +61,7 @@ class RdcGroupSettingsImpl : public RdcGroupSettings {
uint32_t cur_field_group_id_ = 0;
std::mutex group_mutex_;
std::mutex field_group_mutex_;
RdcPartitionPtr partition_;
};
} // namespace rdc
@@ -0,0 +1,44 @@
/*
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_
#include <memory>
#include "rdc/rdc.h"
#include "rdc_lib/RdcPartition.h"
namespace amd {
namespace rdc {
/// Concrete RdcPartition. The member definitions live outside this header;
/// presumably they query AMD SMI - confirm against RdcPartitionImpl.cc.
class RdcPartitionImpl : public RdcPartition {
 public:
  /// Report through `profile` how `resource_type` is allocated to `entity_index`.
  // `override` lets the compiler reject any signature drift from RdcPartition.
  rdc_status_t rdc_instance_profile_get_impl(uint32_t entity_index,
                                             rdc_instance_resource_type_t resource_type,
                                             rdc_resource_profile_t* profile) override;

  /// Report the number of partitions of the GPU at `index` through `num_partition`.
  rdc_status_t rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) override;
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_
@@ -126,9 +126,15 @@ class RdcStandaloneHandler : public RdcHandler {
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override;
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override;
rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override;
// Partition API.
// NOTE(review): presumably served through the GetNumPartition and
// GetInstanceProfile RPCs - confirm in the implementation file.
rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) override;
rdc_status_t rdc_instance_profile_get(uint32_t entity_index,
rdc_instance_resource_type_t resource_type,
rdc_resource_profile_t* profile) override;
explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
const char* client_cert, const char* client_key);
@@ -23,6 +23,8 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
#define INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
#include <vector>
#include "amd_smi/amdsmi.h"
#include "rdc/rdc.h"
@@ -33,6 +35,12 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi);
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
amdsmi_processor_handle* processor_handle);
amdsmi_status_t get_processor_count(uint32_t& all_processor_count);
amdsmi_status_t get_socket_handles(std::vector<amdsmi_socket_handle>& sockets);
amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket,
std::vector<amdsmi_processor_handle>& processors);
amdsmi_status_t get_kfd_partition_id(amdsmi_processor_handle proc, uint32_t* partition_id);
amdsmi_status_t get_metrics_info(amdsmi_processor_handle proc, amdsmi_gpu_metrics_t* metrics);
amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition);
} // namespace rdc
} // namespace amd
+32
Datei anzeigen
@@ -219,6 +219,13 @@ service RdcAPI {
// rdc_status_t GetLinkStatus()
rpc GetLinkStatus(Empty) returns (GetLinkStatusResponse) {}
// Get number of partitions
rpc GetNumPartition(GetNumPartitionRequest) returns (GetNumPartitionResponse);
// Get instance profile of gpu
rpc GetInstanceProfile(GetInstanceProfileRequest) returns (GetInstanceProfileResponse);
}
message Empty {
@@ -804,3 +811,28 @@ message ClearConfigRequest {
message ClearConfigResponse {
uint32 status = 1;
}
// Request for getting the number of partitions for a given GPU index.
message GetNumPartitionRequest {
// The GPU index for which to query the number of partitions.
uint32 gpu_index = 1;
}
// Response for getting the number of partitions.
message GetNumPartitionResponse {
// Status of the operation, following RDC_ST_* codes.
uint32 status = 1;
// Number of partitions for the given GPU.
uint32 num_partition = 2;
}
// Request for the resource profile of a device or partition instance.
message GetInstanceProfileRequest {
// Entity to query; presumably the packed entity index accepted by
// rdc_instance_profile_get - TODO confirm.
uint32 entity_index = 1;
// Resource type to query; presumably an rdc_instance_resource_type_t value
// - TODO confirm.
uint32 resource_type = 2;
}
// Response carrying the resource profile.
message GetInstanceProfileResponse {
// Status of the operation; presumably an RDC_ST_* code like the other
// responses in this file.
uint32 status = 1;
// Presumably mirrors rdc_resource_profile_t::partition_resource.
uint32 partition_resource = 2;
// Presumably mirrors rdc_resource_profile_t::num_partitions_share_resource.
uint32 num_partitions_share_resource = 3;
}
+43 -3
Datei anzeigen
@@ -1,6 +1,7 @@
import os,time
from rdc_bootstrap import *
from RdcUtil import RdcUtil
from typing import Dict
default_field_ids = [
rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
@@ -26,10 +27,18 @@ default_unit_coverter = {
rdc_field_t.RDC_FI_GPU_TEMP: 0.001, # degree
}
class rdc_entity_info_t(Structure):
    # ctypes mirror of the C struct rdc_entity_info_t declared in rdc.h.
    # The library takes and returns this struct by value, so the field order
    # must match the C declaration exactly; a different order silently swaps
    # the device index, instance index, role and type.
    _fields_ = [
            ("device_index", c_uint32),    # physical device index
            ("instance_index", c_uint32),  # partition instance (or core) index
            ("entity_role", c_uint32),     # 0 = physical, 1 = partition instance
            ("device_type", c_uint32),     # 0 = GPU, 1 = CPU
            ]
class RdcReader:
# To run the RDC in embedded mode, set the ip_port = None
def __init__(self, ip_port = "localhost:50051", field_ids = default_field_ids,
unit_converter: dict[int, float] = default_unit_coverter,
unit_converter: Dict[int, float] = default_unit_coverter,
update_freq = 10000000, max_keep_age = 3600.0 , max_keep_samples = 1000,
field_group_name = "rdc_reader_field_group", gpu_group_name = "rdc_reader_gpu_group",
gpu_indexes = None, root_ca = "/etc/rdc/client/certs/rdc_cacert.pem",
@@ -44,6 +53,11 @@ class RdcReader:
self.unit_converter = unit_converter
self.rdc_handle = c_void_p()
rdc.rdc_get_entity_index_from_info.argtypes = [rdc_entity_info_t]
rdc.rdc_get_entity_index_from_info.restype = c_uint32
rdc.rdc_get_info_from_entity_index.argtypes = [c_uint32]
rdc.rdc_get_info_from_entity_index.restype = rdc_entity_info_t
self.is_standalone = True
if not ip_port: # embedded
self.is_standalone = False
@@ -69,7 +83,25 @@ class RdcReader:
if gpu_indexes == None:
self.gpu_indexes = self.rdc_util.get_all_gpu_indexes(self.rdc_handle)
else:
self.gpu_indexes = gpu_indexes
self.gpu_indexes = []
for idx in gpu_indexes:
idx_str = str(idx)
encoded = idx_str.encode("utf-8")
phys_gpu = ctypes.c_uint32()
part_idx = ctypes.c_uint32()
if rdc.rdc_is_partition_string(encoded):
rc = rdc.rdc_parse_partition_string(encoded, ctypes.byref(phys_gpu), ctypes.byref(part_idx))
if not rc:
raise Exception("Rdc failed to parse partition string")
info = rdc_entity_info_t()
info.device_type = 0 #RDC_DEVICE_TYPE_GPU
info.entity_role = 1 #RDC_DEVICE_ROLE_PARTITION
info.instance_index = part_idx
info.device_index = phys_gpu
entity = rdc.rdc_get_entity_index_from_info(info)
self.gpu_indexes.append(entity)
else:
self.gpu_indexes.append(int(idx_str))
self.gpu_group_id, gpu_group_created = self.rdc_util.create_gpu_group(self.rdc_handle, self.gpu_group_name, self.gpu_indexes)
# Create the field group
@@ -140,8 +172,16 @@ class RdcReader:
def handle_field(self, gpu_index, value):
info = rdc.rdc_get_info_from_entity_index(gpu_index)
if info.entity_role == 1: #RDC_DEVICE_ROLE_PARTITION_INSTANCE
gpu_str = f"g{info.device_index}.{info.instance_index}"
else:
gpu_str = str(info.device_index)
field_name = self.rdc_util.field_id_string(value.field_id)
print("%d %d:%d %s:%d" % (value.ts, gpu_index, value.field_id.value, field_name, value.value.l_int))
print("%d %s:%d %s:%d" % (value.ts, gpu_str, value.field_id.value, field_name, value.value.l_int))
if __name__ == '__main__':
@@ -12,12 +12,14 @@ set(BOOTSTRAP_LIB_SRC_LIST
"${COMMON_DIR}/rdc_fields_supported.cc"
"${SRC_DIR}/RdcBootStrap.cc"
"${SRC_DIR}/RdcLibraryLoader.cc"
"${SRC_DIR}/RdcLogger.cc")
"${SRC_DIR}/RdcLogger.cc"
"${SRC_DIR}/RdcEntityCodec.cc")
set(BOOTSTRAP_LIB_INC_LIST
"${COMMON_DIR}/rdc_fields_supported.h"
"${INC_DIR}/RdcHandler.h"
"${INC_DIR}/RdcLibraryLoader.h"
"${INC_DIR}/RdcLogger.h"
"${INC_DIR}/RdcEntityCodec.h"
"${INC_DIR}/rdc_common.h"
"${PROJECT_SOURCE_DIR}/include/rdc/rdc.h")
message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}")
@@ -532,8 +532,26 @@ rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* r
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_link_status_get(results);
}
/**
 * C entry point: report the number of partitions of the GPU at `index`.
 *
 * Returns RDC_ST_INVALID_HANDLER when `p_rdc_handle` is null and
 * RDC_ST_BAD_PARAMETER when `num_partition` is null; otherwise forwards to
 * the RdcHandler behind the handle and returns its status.
 */
rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index,
                                   uint16_t* num_partition) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }
  // A missing output pointer is a caller error, not a handle problem.
  if (!num_partition) {
    return RDC_ST_BAD_PARAMETER;
  }

  return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
      ->rdc_get_num_partition(index, num_partition);
}
/**
 * C entry point: report how `resource_type` is allocated to `entity_index`.
 *
 * Returns RDC_ST_INVALID_HANDLER when `p_rdc_handle` is null and
 * RDC_ST_BAD_PARAMETER when `profile` is null; otherwise forwards to the
 * RdcHandler behind the handle and returns its status.
 */
rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index,
                                      rdc_instance_resource_type_t resource_type,
                                      rdc_resource_profile_t* profile) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }
  // A missing output pointer is a caller error, not a handle problem.
  if (!profile) {
    return RDC_ST_BAD_PARAMETER;
  }

  return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
      ->rdc_instance_profile_get(entity_index, resource_type, profile);
}
const char * get_rocm_path(const char * search_string) {
@@ -573,4 +591,3 @@ const char * get_rocm_path(const char * search_string) {
return rocm_path.c_str();
}
@@ -0,0 +1,111 @@
/*
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <rdc/rdc.h>
#include <rdc_lib/RdcEntityCodec.h>
#include <algorithm>
#include <iostream>
#include <string>
#include "common/rdc_utils.h"
// Unpack a 32-bit entity index: each field is shifted down to bit 0 and then
// masked to its width.
rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index) {
  const uint32_t type_bits = (entity_index >> RDC_ENTITY_TYPE_SHIFT) & RDC_ENTITY_TYPE_MASK;
  const uint32_t role_bits = (entity_index >> RDC_ENTITY_ROLE_SHIFT) & RDC_ENTITY_ROLE_MASK;
  const uint32_t instance_bits =
      (entity_index >> RDC_ENTITY_INSTANCE_SHIFT) & RDC_ENTITY_INSTANCE_MASK;
  const uint32_t device_bits = (entity_index >> RDC_ENTITY_DEVICE_SHIFT) & RDC_ENTITY_DEVICE_MASK;

  rdc_entity_info_t decoded;
  decoded.device_type = static_cast<rdc_device_type_t>(type_bits);
  decoded.entity_role = static_cast<rdc_device_role_t>(role_bits);
  decoded.instance_index = instance_bits;
  decoded.device_index = device_bits;
  return decoded;
}
// Pack an rdc_entity_info_t into a 32-bit entity index: each field is masked
// to its width, shifted into position, and the four pieces are OR-ed together.
uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info) {
  const uint32_t type_field = (static_cast<uint32_t>(info.device_type) & RDC_ENTITY_TYPE_MASK)
                              << RDC_ENTITY_TYPE_SHIFT;
  const uint32_t role_field = (static_cast<uint32_t>(info.entity_role) & RDC_ENTITY_ROLE_MASK)
                              << RDC_ENTITY_ROLE_SHIFT;
  const uint32_t instance_field = (info.instance_index & RDC_ENTITY_INSTANCE_MASK)
                                  << RDC_ENTITY_INSTANCE_SHIFT;
  const uint32_t device_field = (info.device_index & RDC_ENTITY_DEVICE_MASK)
                                << RDC_ENTITY_DEVICE_SHIFT;
  return type_field | role_field | instance_field | device_field;
}
// Return true when `s` has the form "g<gpu>.<partition>", where <gpu> is a
// decimal number below RDC_MAX_NUM_DEVICES and <partition> is a decimal number
// below RDC_MAX_NUM_PARTITIONS. NULL, empty and malformed input yield false.
//
// This is reachable through the C API, so it must not throw: the digits are
// accumulated by hand with an early range check instead of std::stoi, which
// raises std::out_of_range on long digit runs.
bool rdc_is_partition_string(const char* s) {
  if (s == nullptr || s[0] != 'g') {
    return false;
  }

  const std::string text(s);
  const size_t dot_pos = text.find('.');
  // Need at least one character between 'g' and '.', and one after '.'.
  if (dot_pos == std::string::npos || dot_pos <= 1 || dot_pos + 1 >= text.size()) {
    return false;
  }

  // True when `digits` is a non-empty run of ASCII digits whose value is below
  // `limit`. Bailing out as soon as the running value reaches `limit` keeps the
  // accumulator small, so it cannot overflow however long the input is.
  const auto is_number_below = [](const std::string& digits, uint32_t limit) {
    if (digits.empty()) {
      return false;
    }
    uint32_t value = 0;
    for (const char c : digits) {
      // Explicit range compare: ::isdigit on a negative plain char is undefined.
      if (c < '0' || c > '9') {
        return false;
      }
      value = value * 10 + static_cast<uint32_t>(c - '0');
      if (value >= limit) {
        return false;
      }
    }
    return true;
  };

  return is_number_below(text.substr(1, dot_pos - 1), RDC_MAX_NUM_DEVICES) &&
         is_number_below(text.substr(dot_pos + 1), RDC_MAX_NUM_PARTITIONS);
}
// Split a partition string "g<gpu>.<partition>" into its two numbers.
//
// Returns false, leaving the outputs untouched, when any pointer is null or
// `s` is not accepted by rdc_is_partition_string(). On success *physicalGpu
// receives the number before the '.' and *partition the number after it.
bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition) {
  if (s == nullptr || physicalGpu == nullptr || partition == nullptr) {
    return false;
  }

  // Validation establishes: leading 'g', exactly one '.', both sides non-empty
  // digit runs, and both values within the device/partition limits. Nothing
  // needs to be re-checked below, and the conversions cannot go out of range.
  if (!rdc_is_partition_string(s)) {
    return false;
  }

  const std::string text(s);
  const size_t dot_pos = text.find('.');
  *physicalGpu = static_cast<uint32_t>(std::stoi(text.substr(1, dot_pos - 1)));
  *partition = static_cast<uint32_t>(std::stoi(text.substr(dot_pos + 1)));
  return true;
}
@@ -28,6 +28,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
"${SRC_DIR}/RdcConfigSettingsImpl.cc"
"${SRC_DIR}/RdcTelemetryModule.cc"
"${SRC_DIR}/RdcWatchTableImpl.cc"
"${SRC_DIR}/RdcPartitionImpl.cc"
"${SRC_DIR}/SmiUtils.cc")
# TODO: remove all headers? Will just dir be ok after install?
@@ -60,6 +61,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
"${INC_DIR}/impl/RdcSmiLib.h"
"${INC_DIR}/impl/RdcTelemetryModule.h"
"${INC_DIR}/impl/RdcWatchTableImpl.h"
"${INC_DIR}/impl/RdcPartitionImpl.h"
"${INC_DIR}/impl/SmiUtils.h")
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
@@ -36,6 +36,7 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
#include "rdc_lib/impl/RdcNotificationImpl.h"
#include "rdc_lib/impl/RdcPartitionImpl.h"
#include "rdc_lib/impl/RdcPolicyImpl.h"
#include "rdc_lib/impl/RdcTopologyLinkImpl.h"
#include "rdc_lib/impl/RdcWatchTableImpl.h"
@@ -76,7 +77,8 @@ namespace rdc {
const uint32_t METIC_UPDATE_FREQUENCY = 1000; // 1000 microseconds by default
RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
: group_settings_(new RdcGroupSettingsImpl()),
: partition_(new RdcPartitionImpl()),
group_settings_(new RdcGroupSettingsImpl(partition_)),
cache_mgr_(new RdcCacheManagerImpl()),
metric_fetcher_(new RdcMetricFetcherImpl()),
rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
@@ -261,9 +263,14 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, uin
if (status != RDC_ST_OK) {
return status;
}
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index);
uint32_t physical_gpu = info.device_index;
bool is_gpu_exist = false;
for (uint32_t i = 0; i < count; i++) {
if (gpu_index_list[i] == gpu_index) {
if (gpu_index_list[i] == physical_gpu) {
is_gpu_exist = true;
break;
}
@@ -527,5 +534,14 @@ rdc_status_t RdcEmbeddedHandler::rdc_config_clear(rdc_gpu_group_t group_id) {
return config_handler_->rdc_config_clear(group_id);
}
// Embedded mode: the partition count comes straight from the partition helper.
rdc_status_t RdcEmbeddedHandler::rdc_get_num_partition(uint32_t index, uint16_t* num_partition) {
  const rdc_status_t result = partition_->rdc_get_num_partition_impl(index, num_partition);
  return result;
}
// Embedded mode: the resource profile comes straight from the partition helper.
rdc_status_t RdcEmbeddedHandler::rdc_instance_profile_get(
    uint32_t entity_index, rdc_instance_resource_type_t resource_type,
    rdc_resource_profile_t* profile) {
  const rdc_status_t result =
      partition_->rdc_instance_profile_get_impl(entity_index, resource_type, profile);
  return result;
}
} // namespace rdc
} // namespace amd
@@ -23,13 +23,17 @@ THE SOFTWARE.
#include <ctime>
#include "amd_smi/amdsmi.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcPartitionImpl.h"
#include "rdc_lib/impl/SmiUtils.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
RdcGroupSettingsImpl::RdcGroupSettingsImpl(const RdcPartitionPtr& partition)
: partition_(partition) {
// Add the default job stats fields
rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK,
RDC_FI_GPU_UTIL, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX,
@@ -67,23 +71,50 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_g
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) {
std::lock_guard<std::mutex> guard(group_mutex_);
auto ite = gpu_group_.find(groupId);
if (ite != gpu_group_.end()) {
// Check whether the index already exists
for (uint32_t i = 0; i < ite->second.count; i++) {
if (ite->second.entity_ids[i] == gpu_index) {
RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId
<< " as it is already exists");
if (ite == gpu_group_.end()) {
return RDC_ST_NOT_FOUND;
}
rdc_entity_info_t entity_info = rdc_get_info_from_entity_index(gpu_index);
uint16_t num_partitions = 0;
rdc_status_t status =
partition_->rdc_get_num_partition_impl(entity_info.device_index, &num_partitions);
if (status != RDC_ST_OK) {
return status;
}
if (num_partitions != UINT16_MAX && num_partitions > 1) {
if (entity_info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
if (entity_info.instance_index >= num_partitions) {
RDC_LOG(RDC_INFO, "Invalid partition instance: GPU "
<< entity_info.device_index << " supports " << num_partitions
<< " partitions, but instance index is "
<< entity_info.instance_index);
return RDC_ST_BAD_PARAMETER;
}
}
if (ite->second.count < RDC_GROUP_MAX_ENTITIES) {
ite->second.entity_ids[ite->second.count] = gpu_index;
ite->second.count++;
} else {
return RDC_ST_MAX_LIMIT;
}
} else {
return RDC_ST_NOT_FOUND;
if (entity_info.entity_role != RDC_DEVICE_ROLE_PHYSICAL) {
RDC_LOG(RDC_INFO, "GPU " << entity_info.device_index
<< " is not partitionable, but a partition instance was provided.");
return RDC_ST_BAD_PARAMETER;
}
}
// Check whether the index already exists
for (uint32_t i = 0; i < ite->second.count; i++) {
if (ite->second.entity_ids[i] == gpu_index) {
RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId
<< " as it is already exists");
return RDC_ST_BAD_PARAMETER;
}
}
if (ite->second.count < RDC_GROUP_MAX_ENTITIES) {
ite->second.entity_ids[ite->second.count] = gpu_index;
ite->second.count++;
} else {
return RDC_ST_MAX_LIMIT;
}
return RDC_ST_OK;
@@ -471,9 +471,18 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
amdsmi_processor_handle processor_handle = {};
amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle);
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index);
amdsmi_status_t ret = get_processor_handle_from_id(info.device_index, &processor_handle);
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_index << " error: " << ret);
std::string info_str;
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
info_str =
"g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index);
} else {
info_str = std::to_string(info.device_index);
}
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << info_str << " error: " << ret);
return Smi2RdcError(ret);
}
@@ -486,6 +495,138 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
value->field_id = field_id;
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
// Partition instances are answered from the GPU metrics table of the partition's own
// processor handle. Only GPU clock, GFX utilization and decoder utilization are derived
// here; every other field reports RDC_ST_NO_DATA for a partition.
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
  uint16_t num_partitions = 0;
  amdsmi_status_t st = get_num_partition(info.device_index, &num_partitions);
  if (st != AMDSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "Failed to get partition info for GPU " << info.device_index);
    return RDC_ST_UNKNOWN_ERROR;
  }

  // Looked up with the full entity index (gpu_index), not only the device index.
  // Distinct names avoid shadowing the handle/status of the enclosing function.
  amdsmi_processor_handle partition_handle = {};
  amdsmi_status_t smi_ret = get_processor_handle_from_id(gpu_index, &partition_handle);
  if (smi_ret != AMDSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "Cannot get processor handle for partition " << info.instance_index);
    return Smi2RdcError(smi_ret);
  }

  amdsmi_gpu_metrics_t gpu_metrics = {};
  smi_ret = amdsmi_get_gpu_metrics_info(partition_handle, &gpu_metrics);
  if (smi_ret != AMDSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "Failed to get GPU metrics info for partition " << info.instance_index);
    return Smi2RdcError(smi_ret);
  }

  switch (field_id) {
    case RDC_FI_GPU_CLOCK: {
      // Collect the populated clock slots; 0 and 0xFFFF are treated as "no reading".
      const uint32_t kNumClockSlots = 8;
      std::vector<uint16_t> valid_clocks;
      valid_clocks.reserve(kNumClockSlots);
      for (uint32_t i = 0; i < kNumClockSlots; i++) {
        const uint16_t clk = gpu_metrics.current_gfxclks[i];
        if (clk != 0 && clk != 0xFFFF) {
          valid_clocks.push_back(clk);
        }
      }

      const uint32_t vc = static_cast<uint32_t>(valid_clocks.size());
      const uint32_t pCount = static_cast<uint32_t>(num_partitions);
      const uint32_t partIdx = info.instance_index;

      if (valid_clocks.empty() || vc < pCount) {
        RDC_LOG(RDC_ERROR, "No valid clocks, or less than total partitions");
        return RDC_ST_NO_DATA;
      }
      // A reported partition count of 0 would divide by zero below.
      if (pCount == 0) {
        RDC_LOG(RDC_ERROR, "GPU " << info.device_index << " reports 0 partitions");
        return RDC_ST_NO_DATA;
      }
      // An instance index beyond the partition count would index past valid_clocks.
      if (partIdx >= pCount) {
        RDC_LOG(RDC_ERROR, "Invalid partition instance: GPU "
                               << info.device_index << " supports " << pCount
                               << " partitions, but instance index is " << partIdx);
        return RDC_ST_BAD_PARAMETER;
      }

      // The valid clocks are split evenly across the partitions and this partition's
      // share is averaged. chunk_size >= 1 because vc >= pCount, and
      // start_idx + chunk_size <= vc because partIdx < pCount. When vc == pCount this
      // degenerates to valid_clocks[partIdx], so an unpopulated (0 / 0xFFFF) slot is
      // never reported as a clock.
      const uint32_t chunk_size = vc / pCount;
      const uint32_t start_idx = partIdx * chunk_size;
      uint64_t sum = 0;
      for (uint32_t i = start_idx; i < start_idx + chunk_size; i++) {
        sum += valid_clocks[i];
      }
      const uint64_t avg_clock = sum / chunk_size;

      // Scale in 64 bits: a uint16_t promoted to int overflows once the clock value
      // exceeds 2147 (presumably MHz -> Hz; confirm the unit against amdsmi).
      value->value.l_int = static_cast<int64_t>(avg_clock) * 1000000;
      value->type = INTEGER;
      value->status = RDC_ST_OK;
      return RDC_ST_OK;
    }
    case RDC_FI_GPU_UTIL: {
      uint32_t p = info.instance_index;
      if (p >= AMDSMI_MAX_NUM_XCP) {
        return RDC_ST_NO_DATA;
      }
      // Average the GFX busy values of this partition, skipping UINT32_MAX entries.
      const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p];
      uint64_t sum = 0;
      uint32_t count = 0;
      for (uint32_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) {
        uint32_t busy = xcp.gfx_busy_inst[i];
        if (busy != UINT32_MAX) {
          sum += busy;
          count++;
        }
      }
      if (count == 0) {
        return RDC_ST_NO_DATA;
      }
      uint64_t avg_busy = sum / count;
      value->value.l_int = avg_busy;
      value->type = INTEGER;
      value->status = RDC_ST_OK;
      return RDC_ST_OK;
    }
    case RDC_FI_GPU_MM_DEC_UTIL: {
      uint32_t p = info.instance_index;
      if (p >= AMDSMI_MAX_NUM_XCP) {
        return RDC_ST_NO_DATA;
      }
      // Average the VCN busy values of this partition, skipping UINT16_MAX entries.
      const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p];
      uint64_t sum = 0;
      uint32_t count = 0;
      for (uint32_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) {
        uint16_t vcn = xcp.vcn_busy[i];
        if (vcn != UINT16_MAX) {
          sum += vcn;
          count++;
        }
      }
      if (count == 0) {
        return RDC_ST_NO_DATA;
      }
      uint64_t avg_decode = sum / count;
      value->value.l_int = avg_decode;
      value->type = INTEGER;
      value->status = RDC_ST_OK;
      return RDC_ST_OK;
    }
    default:
      // All other fields => N/A for partition
      RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
                                      << " not supported => NO_DATA.");
      return RDC_ST_NO_DATA;
  }
}  // end if partition
auto read_smi_counter = [&](void) {
RdcFieldKey f_key(gpu_index, field_id);
smi_data = get_smi_data(f_key);
@@ -600,12 +741,11 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
break;
}
case RDC_FI_GPU_COUNT: {
uint32_t processor_count = 0;
// amdsmi is initialized in AMDSMI_INIT_AMD_GPUS mode -> returned sockets are GPUs
value->status = get_processor_count(processor_count);
uint32_t socket_count = 0;
value->status = amdsmi_get_socket_handles(&socket_count, nullptr);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(processor_count);
value->value.l_int = static_cast<int64_t>(socket_count);
}
} break;
case RDC_FI_POWER_USAGE: {
@@ -913,8 +1053,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
value->value.l_int = static_cast<int64_t>(pending_page_num);
}
}
} else
} else {
value->status = Smi2RdcError(ret);
}
break;
}
@@ -0,0 +1,117 @@
/*
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcPartitionImpl.h"
#include <stdio.h>
#include <string.h>
#include <iostream>
#include "amd_smi/amdsmi.h"
#include "rdc/rdc.h"
#include "rdc_lib/impl/SmiUtils.h"
namespace amd {
namespace rdc {
/**
 * @brief Look up the resource profile of an entity for one accelerator resource type.
 *
 * The output is zeroed first, so every early return leaves it in a defined state.
 *
 * @param[in]  entity_index  Encoded entity index (decoded with
 *                           rdc_get_info_from_entity_index).
 * @param[in]  resource_type RDC_ACCELERATOR_XCC or RDC_ACCELERATOR_DECODER.
 * @param[out] profile       Receives partition_resource and
 *                           num_partitions_share_resource.
 *
 * @retval RDC_ST_OK            Profile filled in, or the device does not support the
 *                              partition profile query (profile stays zeroed).
 * @retval RDC_ST_BAD_PARAMETER profile is null.
 * @retval RDC_ST_NOT_SUPPORTED resource_type is not one of the mapped types.
 * @retval RDC_ST_UNKNOWN_ERROR An SMI call failed or no matching resource entry exists.
 */
rdc_status_t RdcPartitionImpl::rdc_instance_profile_get_impl(
    uint32_t entity_index, rdc_instance_resource_type_t resource_type,
    rdc_resource_profile_t* profile) {
  if (profile == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  profile->partition_resource = 0;
  profile->num_partitions_share_resource = 0;

  rdc_entity_info_t info = rdc_get_info_from_entity_index(entity_index);

  // The SMI queries are issued against the handle resolved from the device index.
  amdsmi_processor_handle proc_handle = {};
  amdsmi_status_t ret = get_processor_handle_from_id(info.device_index, &proc_handle);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  amdsmi_accelerator_partition_profile_config_t config;
  memset(&config, 0, sizeof(config));
  ret = amdsmi_get_gpu_accelerator_partition_profile_config(proc_handle, &config);
  if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
    // Not an error: report the zeroed profile.
    return RDC_ST_OK;
  } else if (ret != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  amdsmi_accelerator_partition_profile_t active_profile;
  memset(&active_profile, 0, sizeof(active_profile));
  // The ids are not consumed here. The buffer holds one slot per possible partition
  // rather than a single uint32_t, because the SMI call may write an array of ids
  // through this pointer (NOTE(review): confirm against the amdsmi API reference).
  uint32_t partition_ids[RDC_MAX_NUM_PARTITIONS] = {};
  ret = amdsmi_get_gpu_accelerator_partition_profile(proc_handle, &active_profile, partition_ids);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  // If physical device, use profile 0 to get all XCC's/Decoders
  uint32_t lookup_id =
      (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) ? active_profile.profile_index : 0;

  // Map rdc resource type to smi
  amdsmi_accelerator_partition_resource_type_t smi_resource;
  switch (resource_type) {
    case RDC_ACCELERATOR_XCC:
      smi_resource = AMDSMI_ACCELERATOR_XCC;
      break;
    case RDC_ACCELERATOR_DECODER:
      smi_resource = AMDSMI_ACCELERATOR_DECODER;
      break;
    default:
      return RDC_ST_NOT_SUPPORTED;
  }

  // First entry matching both the profile index and the resource type wins.
  // NOTE(review): the scan covers every slot of resource_profiles, including slots the
  // SMI call left zero-filled - confirm a zeroed slot cannot be mistaken for a real
  // profile-0 entry.
  for (uint32_t i = 0; i < AMDSMI_MAX_CP_PROFILE_RESOURCES; i++) {
    const auto& res = config.resource_profiles[i];
    if (res.profile_index == lookup_id && res.resource_type == smi_resource) {
      profile->partition_resource = res.partition_resource;
      profile->num_partitions_share_resource = res.num_partitions_share_resource;
      return RDC_ST_OK;
    }
  }
  return RDC_ST_UNKNOWN_ERROR;
}
/**
 * @brief Query how many partitions a device currently exposes.
 *
 * @param[in]  index         Index forwarded unchanged to get_num_partition.
 * @param[out] num_partition Receives the partition count; must not be null.
 *
 * @retval RDC_ST_OK            The count was retrieved.
 * @retval RDC_ST_BAD_PARAMETER num_partition is null.
 * @retval RDC_ST_UNKNOWN_ERROR The underlying SMI query failed.
 */
rdc_status_t RdcPartitionImpl::rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) {
  // Reject a null output pointer up front, like rdc_instance_profile_get_impl does.
  if (num_partition == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (get_num_partition(index, num_partition) != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }
  return RDC_ST_OK;
}
} // namespace rdc
} // namespace amd
@@ -34,8 +34,8 @@ THE SOFTWARE.
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/impl/SmiUtils.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
@@ -209,6 +209,42 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
return result;
}
// Check for rocprof fields in partitions
rdc_group_info_t ginfo;
result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
if (result != RDC_ST_OK) {
return result;
}
bool groupHasPartition = false;
for (unsigned int i = 0; i < ginfo.count; i++) {
uint32_t entityId = ginfo.entity_ids[i];
rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId);
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
groupHasPartition = true;
break;
}
}
rdc_field_group_info_t field_info;
result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info);
if (result != RDC_ST_OK) {
return result;
}
bool groupHasRocprof = false;
if (result == RDC_ST_OK) {
for (unsigned int i = 0; i < field_info.count; i++) {
rdc_field_t fid = field_info.field_ids[i];
if (fid >= 800 && fid < 900) { // Rocprof fields in the 800's
groupHasRocprof = true;
break;
}
}
}
if (groupHasPartition && groupHasRocprof) {
return RDC_ST_NOT_SUPPORTED;
}
// See if any of the fields are notification fields, and
// set them up, if so.
result = notifications_->set_listen_events(fields_in_watch);
@@ -381,30 +417,30 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component
// set field ids
std::vector<rdc_field_t> field_ids{};
if (components & RDC_HEALTH_WATCH_PCIE) {
field_ids.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT);
field_ids.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT);
}
if (components & RDC_HEALTH_WATCH_XGMI) {
field_ids.push_back(RDC_HEALTH_XGMI_ERROR);
field_ids.push_back(RDC_HEALTH_XGMI_ERROR);
}
if (components & RDC_HEALTH_WATCH_MEM) {
field_ids.push_back(RDC_FI_ECC_UNCORRECT_TOTAL);
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT);
field_ids.push_back(RDC_FI_ECC_UNCORRECT_TOTAL);
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM);
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT);
}
if (components & RDC_HEALTH_WATCH_EEPROM) {
field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID);
field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID);
}
if (components & RDC_HEALTH_WATCH_THERMAL) {
field_ids.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME);
field_ids.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME);
}
if (components & RDC_HEALTH_WATCH_POWER) {
field_ids.push_back(RDC_HEALTH_POWER_THROTTLE_TIME);
field_ids.push_back(RDC_HEALTH_POWER_THROTTLE_TIME);
}
if (0 == field_ids.size()) {
@@ -417,8 +453,7 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component
field_group_name.c_str(), field_group_id);
}
rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
unsigned int components) {
rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) {
// remove old health for same group_id
rdc_health_clear(group_id);
@@ -447,13 +482,11 @@ rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
// get initial values
rdc_field_value value;
result = metric_fetcher_->fetch_smi_field(fields->first, fields->second, &value);
if (result != RDC_ST_OK)
break;
if (result != RDC_ST_OK) break;
// set initial values to cache
result = cache_mgr_->rdc_health_set(group_id, fields->first, value);
if (result != RDC_ST_OK)
break;
if (result != RDC_ST_OK) break;
}
// Start to watch the fields and update fields per 1 second.
@@ -461,10 +494,8 @@ rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
return result;
}
rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id,
unsigned int *components) {
if (nullptr == components)
return RDC_ST_BAD_PARAMETER;
rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) {
if (nullptr == components) return RDC_ST_BAD_PARAMETER;
std::lock_guard<std::mutex> guard(watch_mutex_);
auto table_iter = health_watch_table_.find(group_id);
@@ -478,23 +509,19 @@ rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id,
return RDC_ST_OK;
}
bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
rdc_health_system_t component,
rdc_health_result_t health,
uint32_t err_code,
std::string err_msg,
rdc_health_incidents_t* incident,
bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index, rdc_health_system_t component,
rdc_health_result_t health, uint32_t err_code,
std::string err_msg, rdc_health_incidents_t* incident,
rdc_health_response_t* response) {
bool result = false;
incident->gpu_index = gpu_index;
incident->component = component;
incident->health = health;
incident->gpu_index = gpu_index;
incident->component = component;
incident->health = health;
incident->error.code = err_code;
strncpy_with_null(incident->error.msg, err_msg.c_str(), MAX_HEALTH_MSG_LENGTH);
if (incident->health > response->overall_health)
response->overall_health = incident->health;
if (incident->health > response->overall_health) response->overall_health = incident->health;
response->incidents_count++;
if (response->incidents_count >= HEALTH_MAX_ERROR_ITEMS) {
RDC_LOG(RDC_INFO, "Health incidents are full!");
@@ -504,24 +531,20 @@ bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
return (result);
}
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field,
uint64_t start_timestamp,
rdc_field_value *start_value,
rdc_field_value *end_value) {
if ((nullptr == start_value) && (nullptr == end_value))
return RDC_ST_BAD_PARAMETER;
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id, uint32_t gpu_index,
rdc_field_t field, uint64_t start_timestamp,
rdc_field_value* start_value,
rdc_field_value* end_value) {
if ((nullptr == start_value) && (nullptr == end_value)) return RDC_ST_BAD_PARAMETER;
rdc_status_t result = RDC_ST_OK;
if (nullptr != start_value) {
//get the values of the field at the start_timestamp/end_timestampe
result = cache_mgr_->rdc_health_get_values(group_id,
gpu_index, field,
start_timestamp, 0,
start_value, nullptr);
// get the values of the field at the start_timestamp/end_timestamp
result = cache_mgr_->rdc_health_get_values(group_id, gpu_index, field, start_timestamp, 0,
start_value, nullptr);
if (result != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result);
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field
<< " history data. Return: " << result);
return result;
}
}
@@ -529,30 +552,25 @@ rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
// get end values
result = metric_fetcher_->fetch_smi_field(gpu_index, field, end_value);
if (result != RDC_ST_OK)
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " current data. Return: " << result);
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field
<< " current data. Return: " << result);
return result;
}
rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
// get field start/end values
rdc_field_value start = {}, end = {};
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
//get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_PCIE_REPLAY_COUNT,
start_timestamp,
&start,
&end);
if (result != RDC_ST_OK)
return result;
// get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PCIE_REPLAY_COUNT,
start_timestamp, &start, &end);
if (result != RDC_ST_OK) return result;
uint64_t pcie_replay_count = end.value.l_int - start.value.l_int;
if (pcie_replay_count > PCIE_MAX_REPLAYS_PERMIN) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(pcie_replay_count);
@@ -560,37 +578,26 @@ rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
err_msg += std::to_string(PCIE_MAX_REPLAYS_PERMIN);
err_msg += ".";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_PCIE,
RDC_HEALTH_RESULT_WARN,
RDC_FR_PCI_REPLAY_RATE,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_PCIE, RDC_HEALTH_RESULT_WARN,
RDC_FR_PCI_REPLAY_RATE, err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
}
return RDC_ST_OK;
}
rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
// get field start/end values
rdc_field_value end = {};
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_XGMI_ERROR,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
rdc_status_t result =
get_start_end_values(group_id, gpu_index, RDC_HEALTH_XGMI_ERROR, 0, nullptr, &end);
if (result != RDC_ST_OK) return result;
amdsmi_xgmi_status_t status = static_cast<amdsmi_xgmi_status_t>(end.value.l_int);
if (AMDSMI_XGMI_STATUS_NO_ERRORS != status) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
uint32_t err_code;
std::string err_msg = "Detected ";
@@ -603,106 +610,68 @@ rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
}
err_msg += ".";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_XGMI,
RDC_HEALTH_RESULT_FAIL,
err_code,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_XGMI, RDC_HEALTH_RESULT_FAIL, err_code,
err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
}
return RDC_ST_OK;
}
rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
rdc_field_value start= {}, end = {};
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_FI_ECC_UNCORRECT_TOTAL,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
// get field start/end values
rdc_field_value start = {}, end = {};
rdc_status_t result =
get_start_end_values(group_id, gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL, 0, nullptr, &end);
if (result != RDC_ST_OK) return result;
uint64_t ecc_uncorrectable_count = 0;
ecc_uncorrectable_count = end.value.l_int;
if (ecc_uncorrectable_count > 0) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(ecc_uncorrectable_count);
err_msg += " uncorrectable ECC error(s) since last GPU reset.";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_MEM,
RDC_HEALTH_RESULT_FAIL,
RDC_FR_ECC_UNCORRECTABLE_DETECTED,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL,
RDC_FR_ECC_UNCORRECTABLE_DETECTED, err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
}
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_PENDING_PAGE_NUM,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PENDING_PAGE_NUM, 0, nullptr, &end);
if (result != RDC_ST_OK) return result;
uint64_t num_pages = end.value.l_int;
if (num_pages > 0) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(num_pages);
err_msg += " pending retired page(s).";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_MEM,
RDC_HEALTH_RESULT_WARN,
RDC_FR_PENDING_PAGE_RETIREMENTS,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_WARN,
RDC_FR_PENDING_PAGE_RETIREMENTS, err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
}
//get retired page number
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_RETIRED_PAGE_NUM,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
// get retired page number
result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_NUM, 0, nullptr, &end);
if (result != RDC_ST_OK) return result;
uint64_t retired_page = end.value.l_int;
//get retired page threshold
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_RETIRED_PAGE_LIMIT,
0,
nullptr,
&end);
if (result != RDC_ST_OK)
return result;
// get retired page threshold
result =
get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_LIMIT, 0, nullptr, &end);
if (result != RDC_ST_OK) return result;
uint32_t retired_page_threshold = end.value.l_int;
if (retired_page > retired_page_threshold) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(retired_page);
@@ -710,14 +679,9 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
err_msg += std::to_string(retired_page_threshold);
err_msg += ".";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_MEM,
RDC_HEALTH_RESULT_FAIL,
RDC_FR_RETIRED_PAGES_LIMIT,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL,
RDC_FR_RETIRED_PAGES_LIMIT, err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
return RDC_ST_OK;
@@ -725,31 +689,22 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
if (retired_page > 0) {
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 604800) * 1000;
//get retired page number last 1 week
result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_RETIRED_PAGE_NUM,
start_timestamp,
&start,
&end);
if (result != RDC_ST_OK)
return result;
// get retired page number last 1 week
result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_NUM, start_timestamp,
&start, &end);
if (result != RDC_ST_OK) return result;
retired_page = end.value.l_int - start.value.l_int;
if (retired_page > 1) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(retired_page);
err_msg += " retired pages more than one in the last week.";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_MEM,
RDC_HEALTH_RESULT_FAIL,
RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT,
err_msg,
incident,
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL,
RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT, err_msg, incident,
response))
return RDC_ST_MAX_LIMIT;
}
@@ -758,194 +713,150 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
return RDC_ST_OK;
}
rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
rdc_health_response_t* response) {
rdc_field_value end = {};
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_FI_ECC_UNCORRECT_TOTAL,
0,
nullptr,
&end);
if (result != RDC_ST_OK && result != RDC_ST_CORRUPTED_EEPROM)
return result;
rdc_status_t result =
get_start_end_values(group_id, gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL, 0, nullptr, &end);
if (result != RDC_ST_OK && result != RDC_ST_CORRUPTED_EEPROM) return result;
if (result == RDC_ST_CORRUPTED_EEPROM) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected a corrupt EEPROM since last GPU reset.";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_EEPROM,
RDC_HEALTH_RESULT_WARN,
RDC_FR_CORRUPT_EEPROM,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_EEPROM, RDC_HEALTH_RESULT_WARN,
RDC_FR_CORRUPT_EEPROM, err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
}
return RDC_ST_OK;
}
rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
// get field start/end values
rdc_field_value start = {}, end = {};
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
//get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_THERMAL_THROTTLE_TIME,
start_timestamp,
&start,
&end);
if (result != RDC_ST_OK)
return result;
// get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_THERMAL_THROTTLE_TIME,
start_timestamp, &start, &end);
if (result != RDC_ST_OK) return result;
uint64_t acc_socket_thrm = end.value.l_int - start.value.l_int;
if (0 < acc_socket_thrm) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(acc_socket_thrm);
err_msg += " clock throttling due to thermal violation in the last minute.";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_THERMAL,
RDC_HEALTH_RESULT_WARN,
RDC_FR_CLOCKS_THROTTLE_THERMAL,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_THERMAL, RDC_HEALTH_RESULT_WARN,
RDC_FR_CLOCKS_THROTTLE_THERMAL, err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
}
return RDC_ST_OK;
}
rdc_status_t RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_status_t RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
rdc_health_response_t* response) {
//get field start/end values
// get field start/end values
rdc_field_value start = {}, end = {};
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
//get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id,
gpu_index,
RDC_HEALTH_POWER_THROTTLE_TIME,
start_timestamp,
&start,
&end);
if (result != RDC_ST_OK)
return result;
// get the history data last 1 minute
rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_POWER_THROTTLE_TIME,
start_timestamp, &start, &end);
if (result != RDC_ST_OK) return result;
uint64_t acc_ppt_pwr = end.value.l_int - start.value.l_int;
if (0 < acc_ppt_pwr) {
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
std::string err_msg = "Detected ";
err_msg += std::to_string(acc_ppt_pwr);
err_msg += " Detected clock throttling due to power violation in the last minute.";
//add incident
if (add_health_incident(gpu_index,
RDC_HEALTH_WATCH_POWER,
RDC_HEALTH_RESULT_WARN,
RDC_FR_CLOCKS_THROTTLE_POWER,
err_msg,
incident,
response))
// add incident
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_POWER, RDC_HEALTH_RESULT_WARN,
RDC_FR_CLOCKS_THROTTLE_POWER, err_msg, incident, response))
return RDC_ST_MAX_LIMIT;
}
return RDC_ST_OK;
}
rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t *response) {
if (nullptr == response)
return RDC_ST_BAD_PARAMETER;
rdc_health_response_t* response) {
if (nullptr == response) return RDC_ST_BAD_PARAMETER;
unsigned int components = 0;
std::vector<RdcFieldKey> fields_in_watch;
do { //< lock guard for thread safe
std::lock_guard<std::mutex> guard(watch_mutex_);
auto health = health_watch_table_.find(group_id);
if (health == health_watch_table_.end())
return RDC_ST_NOT_FOUND;
if (health == health_watch_table_.end()) return RDC_ST_NOT_FOUND;
components = health->second.components;
fields_in_watch = health->second.fields;
} while (0);
rdc_group_info_t ginfo;
rdc_status_t result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
if (result != RDC_ST_OK)
return result;
if (result != RDC_ST_OK) return result;
for (auto fields = fields_in_watch.begin(); fields != fields_in_watch.end(); fields++) {
// get current values
rdc_field_value value;
result = metric_fetcher_->fetch_smi_field(fields->first, fields->second, &value);
if (result != RDC_ST_OK)
break;
if (result != RDC_ST_OK) break;
// set current values to cache
result = cache_mgr_->rdc_update_health_stats(group_id, fields->first, value);
if (result != RDC_ST_OK)
break;
if (result != RDC_ST_OK) break;
}
//init response
// init response
response->overall_health = RDC_HEALTH_RESULT_PASS;
response->incidents_count = 0;
for (uint32_t gindex = 0; gindex < ginfo.count; gindex++) {
//PCIe
// PCIe
if (components & RDC_HEALTH_WATCH_PCIE) {
result = pcie_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
if (result == RDC_ST_MAX_LIMIT) return result;
}
//XGMI
// XGMI
if (components & RDC_HEALTH_WATCH_XGMI) {
result = xgmi_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
if (result == RDC_ST_MAX_LIMIT) return result;
}
//Memory
// Memory
if (components & RDC_HEALTH_WATCH_MEM) {
result = memory_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
if (result == RDC_ST_MAX_LIMIT) return result;
}
//EEPROM
// EEPROM
if (components & RDC_HEALTH_WATCH_EEPROM) {
result = eeprom_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
if (result == RDC_ST_MAX_LIMIT) return result;
}
//Thermal
// Thermal
if (components & RDC_HEALTH_WATCH_THERMAL) {
result = thermal_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
if (result == RDC_ST_MAX_LIMIT) return result;
}
//Power
// Power
if (components & RDC_HEALTH_WATCH_POWER) {
result = power_check(group_id, ginfo.entity_ids[gindex], response);
if (result == RDC_ST_MAX_LIMIT)
return result;
if (result == RDC_ST_MAX_LIMIT) return result;
}
} //end of for gindex
} // end of for gindex
return RDC_ST_OK;
}
@@ -953,7 +864,7 @@ rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
rdc_status_t RdcWatchTableImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
rdc_field_grp_t field_group_id;
do { //< lock guard for thread safe
do { //< lock guard for thread safe
std::lock_guard<std::mutex> guard(watch_mutex_);
auto health = health_watch_table_.find(group_id);
if (health == health_watch_table_.end()) {
@@ -1219,8 +1130,8 @@ void RdcWatchTableImpl::debug_status() {
for (const auto& p : hite->second.fields) {
strstream << "<" << p.first << "," << p.second << "> ";
}
RDC_LOG(RDC_DEBUG,
"group id : " << hite->first << " components : " << hite->second.components << " fields : " << strstream.str());
RDC_LOG(RDC_DEBUG, "group id : " << hite->first << " components : " << hite->second.components
<< " fields : " << strstream.str());
}
if (fields_to_watch_.size() > 0) {
+108 -28
Datei anzeigen
@@ -23,6 +23,7 @@ THE SOFTWARE.
#include "rdc_lib/impl/SmiUtils.h"
#include <cstdint>
#include <cstring>
#include <vector>
#include "amd_smi/amdsmi.h"
@@ -79,44 +80,59 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) {
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
amdsmi_processor_handle* processor_handle) {
uint32_t socket_count;
uint32_t processor_count;
auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
uint32_t socket_count = 0;
amdsmi_status_t ret = amdsmi_get_socket_handles(&socket_count, nullptr);
if (ret != AMDSMI_STATUS_SUCCESS) {
return ret;
}
std::vector<amdsmi_socket_handle> sockets(socket_count);
std::vector<amdsmi_processor_handle> all_processors{};
ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
for (auto& socket : sockets) {
ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
if (ret != AMDSMI_STATUS_SUCCESS) {
return ret;
}
std::vector<amdsmi_processor_handle> processors(processor_count);
ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data());
if (ret != AMDSMI_STATUS_SUCCESS) {
return ret;
}
for (auto& processor : processors) {
processor_type_t processor_type = {};
ret = amdsmi_get_processor_type(processor, &processor_type);
if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
RDC_LOG(RDC_ERROR, "Expect AMD_GPU device type!");
return AMDSMI_STATUS_NOT_SUPPORTED;
}
all_processors.push_back(processor);
}
std::vector<amdsmi_socket_handle> sockets(socket_count);
ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
if (ret != AMDSMI_STATUS_SUCCESS) {
return ret;
}
if (gpu_id >= all_processors.size()) {
std::vector<std::vector<amdsmi_processor_handle>> procs_by_socket;
procs_by_socket.resize(socket_count);
for (size_t s = 0; s < sockets.size(); s++) {
uint32_t proc_count = 0;
ret = amdsmi_get_processor_handles(sockets[s], &proc_count, nullptr);
if (ret != AMDSMI_STATUS_SUCCESS) {
return ret;
}
std::vector<amdsmi_processor_handle> procs(proc_count);
ret = amdsmi_get_processor_handles(sockets[s], &proc_count, procs.data());
if (ret != AMDSMI_STATUS_SUCCESS) {
return ret;
}
for (auto& proc : procs) {
processor_type_t proc_type = {};
ret = amdsmi_get_processor_type(proc, &proc_type);
if (proc_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
return AMDSMI_STATUS_NOT_SUPPORTED;
}
}
procs_by_socket[s] = procs;
}
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_id);
uint32_t socket_index = info.device_index;
uint32_t instance_index = info.instance_index;
if (socket_index >= procs_by_socket.size()) {
return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
}
// Get processor handle from GPU id
*processor_handle = all_processors[gpu_id];
const auto& handles = procs_by_socket[socket_index];
if (instance_index >= handles.size()) {
return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
}
*processor_handle = handles[instance_index];
return AMDSMI_STATUS_SUCCESS;
}
@@ -141,5 +157,69 @@ amdsmi_status_t get_processor_count(uint32_t& all_processor_count) {
return AMDSMI_STATUS_SUCCESS;
}
// Fill `sockets` with the socket handles reported by AMD SMI.
//
// The library is queried twice: once with a null buffer to obtain the socket
// count, then again with `sockets` resized to that count.
//
// Returns the status of the first failing amdsmi_get_socket_handles() call,
// or the status of the second call otherwise. `sockets` is left untouched when
// the count query fails.
amdsmi_status_t get_socket_handles(std::vector<amdsmi_socket_handle>& sockets) {
  uint32_t num_sockets = 0;
  const amdsmi_status_t count_status = amdsmi_get_socket_handles(&num_sockets, nullptr);
  if (count_status != AMDSMI_STATUS_SUCCESS) {
    return count_status;
  }

  sockets.resize(num_sockets);
  return amdsmi_get_socket_handles(&num_sockets, sockets.data());
}
// Fill `processors` with the processor handles that AMD SMI reports for
// `socket`.
//
// Same two-call pattern as the socket query: a null-buffer call yields the
// count, `processors` is resized to it, and a second call fills the buffer.
//
// Returns the status of the first failing amdsmi_get_processor_handles() call,
// or the status of the second call otherwise.
amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket,
                                      std::vector<amdsmi_processor_handle>& processors) {
  uint32_t num_processors = 0;
  const amdsmi_status_t count_status =
      amdsmi_get_processor_handles(socket, &num_processors, nullptr);
  if (count_status != AMDSMI_STATUS_SUCCESS) {
    return count_status;
  }

  processors.resize(num_processors);
  return amdsmi_get_processor_handles(socket, &num_processors, processors.data());
}
// Read the KFD info of `proc` and store its `current_partition_id` field in
// `*partition_id`.
//
// `*partition_id` is only written when amdsmi_get_gpu_kfd_info() succeeds; the
// status of that call is returned either way.
amdsmi_status_t get_kfd_partition_id(amdsmi_processor_handle proc, uint32_t* partition_id) {
  amdsmi_kfd_info_t info = {};
  const amdsmi_status_t status = amdsmi_get_gpu_kfd_info(proc, &info);
  if (status == AMDSMI_STATUS_SUCCESS) {
    *partition_id = info.current_partition_id;
  }
  return status;
}
// Thin wrapper: forwards to amdsmi_get_gpu_metrics_info() and returns its
// status unchanged. `metrics` is filled by the library call.
amdsmi_status_t get_metrics_info(amdsmi_processor_handle proc, amdsmi_gpu_metrics_t* metrics) {
  return amdsmi_get_gpu_metrics_info(proc, metrics);
}
// Report the `num_partition` value from the GPU metrics of the device
// identified by `index`.
//
// `index` is resolved to a processor handle via get_processor_handle_from_id();
// the metrics are then fetched with get_metrics_info(). `*num_partition` is
// written only when both steps succeed. Returns the status of the first failing
// step, or the (successful) status of the metrics query.
amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) {
  amdsmi_processor_handle handle;
  amdsmi_status_t status = get_processor_handle_from_id(index, &handle);
  if (status != AMDSMI_STATUS_SUCCESS) {
    return status;
  }

  // Start from an all-zero metrics structure before the library fills it in.
  amdsmi_gpu_metrics_t gpu_metrics;
  memset(&gpu_metrics, 0, sizeof(gpu_metrics));

  status = get_metrics_info(handle, &gpu_metrics);
  if (status == AMDSMI_STATUS_SUCCESS) {
    *num_partition = gpu_metrics.num_partition;
  }
  return status;
}
} // namespace rdc
} // namespace amd
@@ -1075,5 +1075,41 @@ rdc_status_t RdcStandaloneHandler::rdc_link_status_get(rdc_link_status_t* result
return RDC_ST_OK;
}
// Ask the remote service for the partition count of GPU `index` through the
// GetNumPartition RPC.
//
// The transport status and the status embedded in the reply are combined by
// error_handle(); `*num_partition` is written only when that yields RDC_ST_OK.
rdc_status_t RdcStandaloneHandler::rdc_get_num_partition(uint32_t index, uint16_t* num_partition) {
  ::rdc::GetNumPartitionRequest req;
  ::rdc::GetNumPartitionResponse resp;
  ::grpc::ClientContext ctx;
  req.set_gpu_index(index);

  const ::grpc::Status rpc_status = stub_->GetNumPartition(&ctx, req, &resp);
  const rdc_status_t rc = error_handle(rpc_status, resp.status());
  if (rc != RDC_ST_OK) {
    return rc;
  }

  *num_partition = resp.num_partition();
  return RDC_ST_OK;
}
// Fetch the resource profile of `entity_index` for `resource_type` through the
// GetInstanceProfile RPC.
//
// The resource type is sent as its uint32_t value. `profile` is filled
// (partition_resource and num_partitions_share_resource) only when
// error_handle() reports RDC_ST_OK for the call.
rdc_status_t RdcStandaloneHandler::rdc_instance_profile_get(
    uint32_t entity_index, rdc_instance_resource_type_t resource_type,
    rdc_resource_profile_t* profile) {
  ::rdc::GetInstanceProfileRequest req;
  ::rdc::GetInstanceProfileResponse resp;
  ::grpc::ClientContext ctx;
  req.set_entity_index(entity_index);
  req.set_resource_type(static_cast<uint32_t>(resource_type));

  const ::grpc::Status rpc_status = stub_->GetInstanceProfile(&ctx, req, &resp);
  const rdc_status_t rc = error_handle(rpc_status, resp.status());
  if (rc != RDC_ST_OK) {
    return rc;
  }

  profile->partition_resource = resp.partition_resource();
  profile->num_partitions_share_resource = resp.num_partitions_share_resource();
  return RDC_ST_OK;
}
} // namespace rdc
} // namespace amd
+2 -1
Datei anzeigen
@@ -56,7 +56,8 @@ set(INC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
include_directories(${INC_DIR} ${PROJECT_SOURCE_DIR}/include
"${GRPC_ROOT}/include"
${PROJECT_SOURCE_DIR})
${PROJECT_SOURCE_DIR}
${AMD_SMI_INCLUDE_DIR})
set(RDCI_SRC_LIST
"${COMMON_DIR}/rdc_fields_supported.cc"
@@ -37,7 +37,9 @@ class RdciDiscoverySubSystem : public RdciSubSystem {
bool show_help_;
void show_help() const;
bool is_list_;
bool is_partition_;
void show_attributes();
void show_attributes_with_partitions();
bool show_version_;
void show_version();
};
@@ -43,6 +43,9 @@ class RdciDmonSubSystem : public RdciSubSystem {
void show_field_usage() const;
void clean_up();
// Need to resolve gpu indexes after process is called
void resolve_gpu_indexes();
void create_temp_group();
void create_temp_field_group();
@@ -64,6 +67,7 @@ class RdciDmonSubSystem : public RdciSubSystem {
std::map<OPTIONS, uint32_t> options_;
std::vector<rdc_field_t> field_ids_;
std::string raw_gpu_indexes_;
std::vector<uint32_t> gpu_indexes_;
bool need_cleanup_;
uint64_t latest_time_stamp_;
+173 -16
Datei anzeigen
@@ -24,6 +24,10 @@ THE SOFTWARE.
#include <getopt.h>
#include <unistd.h>
#include <cstring>
#include <iomanip>
#include <set>
#include "rdc/rdc.h"
#include "rdc/rdc_private.h"
#include "rdc_lib/RdcException.h"
@@ -33,22 +37,23 @@ namespace amd {
namespace rdc {
RdciDiscoverySubSystem::RdciDiscoverySubSystem()
: show_help_(false),
is_list_(false),
show_version_(false) {}
: show_help_(false), is_list_(false), is_partition_(false), show_version_(false) {}
void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) {
const int HOST_OPTIONS = 1000;
const int JSON_OPTIONS = 1001;
const struct option long_options[] = {
{"host", required_argument, nullptr, HOST_OPTIONS}, {"help", optional_argument, nullptr, 'h'},
{"unauth", optional_argument, nullptr, 'u'}, {"list", optional_argument, nullptr, 'l'},
{"json", optional_argument, nullptr, JSON_OPTIONS}, {"version", optional_argument, nullptr, 'v'}, {nullptr, 0, nullptr, 0}};
const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
{"help", optional_argument, nullptr, 'h'},
{"unauth", optional_argument, nullptr, 'u'},
{"list", optional_argument, nullptr, 'l'},
{"json", optional_argument, nullptr, JSON_OPTIONS},
{"version", optional_argument, nullptr, 'v'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
int opt = 0;
while ((opt = getopt_long(argc, argv, "hluv", long_options, &option_index)) != -1) {
while ((opt = getopt_long(argc, argv, "hliuv", long_options, &option_index)) != -1) {
switch (opt) {
case HOST_OPTIONS:
ip_port_ = optarg;
@@ -65,6 +70,9 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) {
case 'l':
is_list_ = true;
break;
case 'i':
is_partition_ = true;
break;
case 'v':
show_version_ = true;
break;
@@ -74,9 +82,10 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) {
}
}
if ((!is_list_ && !show_version_) || (is_list_ && show_version_)) {
int opCount = (is_list_ ? 1 : 0) + (is_partition_ ? 1 : 0) + (show_version_ ? 1 : 0);
if (opCount != 1) {
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify operations");
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify exactly one operation");
}
}
@@ -93,6 +102,8 @@ void RdciDiscoverySubSystem::show_help() const {
<< "Output using json.\n";
std::cout << " -l --list list GPU discovered"
<< " on the system\n";
std::cout << " -i --gpu-instance list GPU discovered"
<< " on the system with partitions\n";
std::cout << " -v --version Display version information of the"
<< " the server and libraries used by the server\n";
}
@@ -108,7 +119,7 @@ void RdciDiscoverySubSystem::show_attributes() {
if (is_json_output()) {
std::cout << "\"gpus\" : [], \"status\": \"ok\"";
} else {
std::cout << "No GPUs find on the system\n";
std::cout << "No GPUs found on the system\n";
}
return;
}
@@ -145,6 +156,145 @@ void RdciDiscoverySubSystem::show_attributes() {
}
}
void RdciDiscoverySubSystem::show_attributes_with_partitions() {
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
uint32_t count = 0;
rdc_status_t result = rdc_device_get_all(rdc_handle_, gpu_index_list, &count);
if (result != RDC_ST_OK) {
throw RdcException(result, "Fail to get all devices");
}
if (count == 0) {
if (is_json_output())
std::cout << "\"gpus\" : [], \"status\": \"ok\"";
else
std::cout << "No GPUs found on the system\n";
return;
}
// Print header.
if (!is_json_output()) {
std::cout << count << " GPUs found." << std::endl;
std::cout << "---------------------------------------------------------------------"
<< std::endl;
std::cout << std::setw(12) << std::left << "GPU Index" << std::setw(20) << "Instance Index"
<< std::setw(25) << "Device Information" << std::setw(8) << "XCC" << std::setw(8)
<< "DECODER" << std::endl;
} else {
std::cout << "\"gpus\" : [";
}
// Loop over each GPU.
for (uint32_t i = 0; i < count; i++) {
rdc_device_attributes_t attribute;
result = rdc_device_get_attributes(rdc_handle_, gpu_index_list[i], &attribute);
if (result != RDC_ST_OK) return;
// Build physical device entity info.
rdc_entity_info_t phys_info;
phys_info.device_index = i;
phys_info.instance_index = 0;
phys_info.entity_role = RDC_DEVICE_ROLE_PHYSICAL;
phys_info.device_type = RDC_DEVICE_TYPE_GPU;
uint32_t phys_entity_index = rdc_get_entity_index_from_info(phys_info);
rdc_resource_profile_t phys_xcc = {};
rdc_resource_profile_t phys_decoder_profile = {};
result =
rdc_instance_profile_get(rdc_handle_, phys_entity_index, RDC_ACCELERATOR_XCC, &phys_xcc);
result = rdc_instance_profile_get(rdc_handle_, phys_entity_index, RDC_ACCELERATOR_DECODER,
&phys_decoder_profile);
std::string phys_xcc_str = std::to_string(phys_xcc.partition_resource);
std::string phys_decoder_str = std::to_string(phys_decoder_profile.partition_resource);
if (!is_json_output()) {
std::cout << std::setw(12) << std::left << i << std::setw(20) << "" << std::setw(25)
<< attribute.device_name << std::setw(8) << phys_xcc_str << std::setw(8)
<< phys_decoder_str << std::endl;
} else {
std::cout << "{\"gpu_index\": \"" << i << "\", "
<< "\"device_name\": \"" << attribute.device_name << "\", "
<< "\"physical\": {"
<< "\"xcc\": \"" << phys_xcc_str << "\", "
<< "\"decoder\": \"" << phys_decoder_str << "\" "
<< "}";
}
uint16_t num_partition = 0;
rdc_status_t result = rdc_get_num_partition(rdc_handle_, i, &num_partition);
if (result != RDC_ST_OK) {
return;
}
// A partitionable device not in partitionable mode will have metrics.num_partition=1
// Where as, a non-partitionable device will have metrics.num_partition = UINT16_MAX
if (num_partition != UINT16_MAX && num_partition > 1) {
if (is_json_output()) {
std::cout << ", \"partitions\": [";
}
for (uint32_t pid = 0; pid < num_partition; pid++) {
std::string instance_str = "g" + std::to_string(i) + "." + std::to_string(pid);
rdc_entity_info_t part_info;
part_info.device_index = i;
part_info.instance_index = pid;
part_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE;
part_info.device_type = RDC_DEVICE_TYPE_GPU;
uint32_t part_entity_index = rdc_get_entity_index_from_info(part_info);
rdc_resource_profile_t part_xcc = {};
rdc_resource_profile_t part_decoder = {};
result = rdc_instance_profile_get(rdc_handle_, part_entity_index, RDC_ACCELERATOR_XCC,
&part_xcc);
result = rdc_instance_profile_get(rdc_handle_, part_entity_index, RDC_ACCELERATOR_DECODER,
&part_decoder);
std::string part_decoder_str = std::to_string(part_decoder.partition_resource);
std::string part_xcc_str = std::to_string(part_xcc.partition_resource);
std::string starColumn = " ";
if (part_decoder.num_partitions_share_resource > 1) {
starColumn = "*";
}
if (!is_json_output()) {
std::cout << std::setw(12) << "" << std::setw(20) << instance_str << std::setw(25) << ""
<< std::setw(7) << part_xcc_str << std::setw(1) << starColumn << std::setw(8)
<< part_decoder_str << std::endl;
} else {
std::string decoder_shared =
(part_decoder.num_partitions_share_resource > 1) ? "true" : "false";
std::cout << "{\"instance_index\": \"" << instance_str << "\", "
<< "\"xcc\": \"" << part_xcc_str << "\", "
<< "\"decoder\": \"" << part_decoder_str << "\", "
<< "\"decoder_shared\": " << decoder_shared << "}";
if (pid != num_partition - 1) {
std::cout << ",";
} else {
std::cout << "]";
}
}
}
}
if (is_json_output()) {
if (i != count - 1)
std::cout << "},";
else
std::cout << "}";
}
}
if (!is_json_output()) {
std::cout << "---------------------------------------------------------------------"
<< std::endl;
std::cout << "* if the resource is shared" << std::endl;
} else {
std::cout << ']';
}
}
void RdciDiscoverySubSystem::show_version() {
rdc_component_version_t smiv;
rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDMSI_COMPONENT, &smiv);
@@ -155,18 +305,21 @@ void RdciDiscoverySubSystem::show_version() {
mixed_component_version_t rdcdv;
uint32_t ret = get_mixed_component_version(rdc_handle_, RDCD_COMPONENT, &rdcdv);
if (ret) {
std::cout << "get rdcd version fail"<< std::endl;
std::cout << "get rdcd version fail" << std::endl;
return;
}
if (is_json_output()) {
if (is_json_output()) {
std::cout << "\"version\" : ";
std::cout << '{';
std::cout << "\"rdcd\": " << "\"" << rdcdv.version << "\", ";
std::cout << "\"amdsmi_lib\": " << "\"" << smiv.version << "\"";
std::cout << "\"rdcd\": "
<< "\"" << rdcdv.version << "\", ";
std::cout << "\"amdsmi_lib\": "
<< "\"" << smiv.version << "\"";
std::cout << '}';
} else {
std::cout << "RDCD : " << rdcdv.version << " | " << "AMDSMI Library : " << smiv.version << std::endl;
std::cout << "RDCD : " << rdcdv.version << " | "
<< "AMDSMI Library : " << smiv.version << std::endl;
}
return;
@@ -181,6 +334,10 @@ void RdciDiscoverySubSystem::process() {
return show_attributes();
}
if (is_partition_) {
return show_attributes_with_partitions();
}
if (show_version_) {
return show_version();
}
+94 -12
Datei anzeigen
@@ -26,6 +26,8 @@ THE SOFTWARE.
#include <signal.h>
#include <unistd.h>
#include <algorithm>
#include <cctype>
#include <cmath>
#include <ctime>
#include <exception>
#include <iomanip>
@@ -62,6 +64,15 @@ void RdciDmonSubSystem::set_terminating(int sig) {
}
}
std::string entity_to_string(uint32_t entity_index) {
rdc_entity_info_t info = rdc_get_info_from_entity_index(entity_index);
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
return "g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index);
}
return std::to_string(info.device_index);
}
void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) {
const int HOST_OPTIONS = 1000;
const int LIST_ALL_FIELDS_OPT = 1001;
@@ -174,15 +185,6 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) {
if (gpu_indexes == "") {
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPUs or group id");
} else {
std::vector<std::string> vec_ids = split_string(gpu_indexes, ',');
for (uint32_t i = 0; i < vec_ids.size(); i++) {
if (!IsNumber(vec_ids[i])) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"The GPU index " + vec_ids[i] + " needs to be a number");
}
gpu_indexes_.push_back(std::stoi(vec_ids[i]));
}
}
}
@@ -207,6 +209,9 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) {
if (options_.find(OPTIONS_COUNT) == options_.end()) {
options_.insert({OPTIONS_COUNT, std::numeric_limits<uint32_t>::max()});
}
// Store gpu indexes to parse later
raw_gpu_indexes_ = gpu_indexes;
}
void RdciDmonSubSystem::show_help() const {
@@ -272,8 +277,15 @@ void RdciDmonSubSystem::create_temp_group() {
for (uint32_t i = 0; i < gpu_indexes_.size(); i++) {
result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_indexes_[i]);
if (result != RDC_ST_OK) {
throw RdcException(result,
"Fail to add " + std::to_string(gpu_indexes_[i]) + " to the dmon group.");
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_indexes_[i]);
std::string info_str;
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
info_str =
"g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index);
} else {
info_str = std::to_string(info.device_index);
}
throw RdcException(result, "Fail to add " + info_str + " to the dmon group.");
}
}
options_.insert({OPTIONS_GROUP_ID, group_id});
@@ -301,6 +313,73 @@ void RdciDmonSubSystem::create_temp_field_group() {
options_.insert({OPTIONS_FIELD_GROUP_ID, group_id});
}
void RdciDmonSubSystem::resolve_gpu_indexes() {
uint32_t device_list[RDC_MAX_NUM_DEVICES];
uint32_t count = 0;
rdc_status_t res = rdc_device_get_all(rdc_handle_, device_list, &count);
if (res != RDC_ST_OK) {
throw RdcException(res, "Failed to get all devices");
}
std::vector<std::string> vec_ids = split_string(raw_gpu_indexes_, ',');
for (uint32_t i = 0; i < vec_ids.size(); i++) {
if (rdc_is_partition_string(vec_ids[i].c_str())) {
uint32_t logicalPhysicalGpu;
uint32_t partition;
if (!rdc_parse_partition_string(vec_ids[i].c_str(), &logicalPhysicalGpu, &partition)) {
throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid partition format: " + vec_ids[i]);
}
if (logicalPhysicalGpu >= count) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"GPU " + std::to_string(logicalPhysicalGpu) + " is out of range");
}
uint32_t physicalGpu = device_list[logicalPhysicalGpu];
uint16_t num_partitions = 0;
rdc_status_t st = rdc_get_num_partition(rdc_handle_, physicalGpu, &num_partitions);
if (st != RDC_ST_OK) {
throw RdcException(st,
"Failed to get partition info for GPU " + std::to_string(physicalGpu));
}
if (num_partitions == UINT16_MAX || num_partitions <= 1) {
if (partition != 0) {
throw RdcException(RDC_ST_BAD_PARAMETER, "GPU " + std::to_string(physicalGpu) +
" is not partitioned, so partition " +
std::to_string(partition) + " is invalid");
}
} else {
if (partition >= num_partitions) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"GPU " + std::to_string(physicalGpu) + " supports only " +
std::to_string(num_partitions) + " partitions, partition " +
std::to_string(partition) + " is invalid");
}
}
rdc_entity_info_t phys_info;
phys_info.device_index = physicalGpu;
phys_info.instance_index = partition;
phys_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE;
phys_info.device_type = RDC_DEVICE_TYPE_GPU;
uint32_t phys_entity_index = rdc_get_entity_index_from_info(phys_info);
gpu_indexes_.push_back(phys_entity_index);
} else if (IsNumber(vec_ids[i])) {
uint32_t logicalIndex = std::stoi(vec_ids[i]);
if (logicalIndex >= count) {
throw RdcException(RDC_ST_BAD_PARAMETER,
"GPU " + std::to_string(logicalIndex) + " is out of range");
}
gpu_indexes_.push_back(std::stoi(vec_ids[i]));
} else {
throw RdcException(RDC_ST_BAD_PARAMETER, "The GPU index " + vec_ids[i] +
" needs to be a number or a valid partition");
}
}
}
void RdciDmonSubSystem::show_field_usage() const {
std::cout << "Supported fields Ids:" << std::endl;
@@ -430,6 +509,8 @@ void RdciDmonSubSystem::process() {
rdc_group_info_t group_info;
rdc_field_group_info_t field_info;
resolve_gpu_indexes();
// Create a temporary group/field if pass as GPU indexes or field ids
create_temp_group();
create_temp_field_group();
@@ -516,7 +597,8 @@ void RdciDmonSubSystem::process() {
print_and_clr_notif_pq(&notif_pq, show_timpstamps_);
for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
std::cout << group_info.entity_ids[gindex] << "\t";
std::cout << std::setw(12) << std::left << entity_to_string(group_info.entity_ids[gindex])
<< "\t";
for (uint32_t findex = 0; findex < reg_fields.size(); findex++) {
rdc_field_value value;
@@ -184,6 +184,14 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
const ::rdc::ClearConfigRequest* request,
::rdc::ClearConfigResponse* reply) override;
::grpc::Status GetNumPartition(::grpc::ServerContext* context,
const ::rdc::GetNumPartitionRequest* request,
::rdc::GetNumPartitionResponse* reply) override;
::grpc::Status GetInstanceProfile(::grpc::ServerContext* context,
const ::rdc::GetInstanceProfileRequest* request,
::rdc::GetInstanceProfileResponse* reply) override;
private:
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
rdc_handle_t rdc_handle_;
+46 -3
Datei anzeigen
@@ -1071,7 +1071,7 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
static_cast<::rdc::TopologyLinkInfo_LinkType>(topology_results.link_infos[i].link_type));
linkinfos->set_p2p_accessible(topology_results.link_infos[i].is_p2p_accessible);
}
return ::grpc::Status::OK;
return ::grpc::Status::OK;
}
::grpc::Status RdcAPIServiceImpl::SetConfig(::grpc::ServerContext* context,
@@ -1140,13 +1140,56 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
gpulinkstatus->set_link_types(
static_cast<::rdc::GpuLinkStatus_LinkTypes>(link_status_results.gpus[i].link_types));
for (uint32_t n = 0; n < link_status_results.gpus[i].num_of_links; n++) {
gpulinkstatus->add_link_states(static_cast<::rdc::GpuLinkStatus_LinkState>(
link_status_results.gpus[i].link_states[n]));
gpulinkstatus->add_link_states(
static_cast<::rdc::GpuLinkStatus_LinkState>(link_status_results.gpus[i].link_states[n]));
}
}
return ::grpc::Status::OK;
}
// gRPC handler for GetNumPartition.
//
// Calls rdc_get_num_partition() for the GPU index carried in the request and
// copies the resulting RDC status into the reply. The partition count is set in
// the reply only when that status is RDC_ST_OK. The gRPC status itself is OK
// unless the request or reply pointer is null.
::grpc::Status RdcAPIServiceImpl::GetNumPartition(::grpc::ServerContext* context,
                                                  const ::rdc::GetNumPartitionRequest* request,
                                                  ::rdc::GetNumPartitionResponse* reply) {
  (void)context;
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty request or reply");
  }

  uint16_t partition_count = 0;
  const rdc_status_t rc =
      rdc_get_num_partition(rdc_handle_, request->gpu_index(), &partition_count);

  reply->set_status(rc);
  if (rc == RDC_ST_OK) {
    reply->set_num_partition(partition_count);
  }
  return ::grpc::Status::OK;
}
// gRPC handler for GetInstanceProfile.
//
// Calls rdc_instance_profile_get() with the entity index and resource type
// carried in the request (the resource type arrives as a uint32_t and is cast
// to rdc_instance_resource_type_t). The RDC status is always copied into the
// reply; the profile fields are set only when it is RDC_ST_OK. The gRPC status
// itself is OK unless the request or reply pointer is null.
::grpc::Status RdcAPIServiceImpl::GetInstanceProfile(
    ::grpc::ServerContext* context, const ::rdc::GetInstanceProfileRequest* request,
    ::rdc::GetInstanceProfileResponse* reply) {
  (void)context;
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty request or reply");
  }

  const auto requested_type =
      static_cast<rdc_instance_resource_type_t>(request->resource_type());

  rdc_resource_profile_t resource_profile;
  memset(&resource_profile, 0, sizeof(resource_profile));

  // Call the RDC API that (in embedded mode) uses AMD SMI
  const rdc_status_t rc = rdc_instance_profile_get(rdc_handle_, request->entity_index(),
                                                   requested_type, &resource_profile);

  reply->set_status(rc);
  if (rc == RDC_ST_OK) {
    reply->set_partition_resource(resource_profile.partition_resource);
    reply->set_num_partitions_share_resource(resource_profile.num_partitions_share_resource);
  }
  return ::grpc::Status::OK;
}
} // namespace rdc
} // namespace amd