[SWDEV-498711] RDC Partition Implementation (#119)
* [SWDEV-498711] RDC Partition Implementation
Change-Id: Ibfc3709793770537e4c9d36458f34c6b4f461724
Signed-off-by: adapryor <Adam.pryor@amd.com>
[ROCm/rdc commit: 47692d3ed5]
Dieser Commit ist enthalten in:
@@ -129,6 +129,11 @@ typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;
|
||||
*/
|
||||
#define RDC_MAX_NUM_DEVICES 128
|
||||
|
||||
/**
|
||||
* @brief Max number of partitions
|
||||
*/
|
||||
#define RDC_MAX_NUM_PARTITIONS 8
|
||||
|
||||
/**
|
||||
* @brief The max fields in a field group
|
||||
*/
|
||||
@@ -1617,6 +1622,139 @@ rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_i
|
||||
|
||||
const char* get_rocm_path(const char* search_string);
|
||||
|
||||
/**
|
||||
* @brief The device role
|
||||
*/
|
||||
typedef enum {
|
||||
RDC_DEVICE_ROLE_PHYSICAL,
|
||||
RDC_DEVICE_ROLE_PARTITION_INSTANCE //!< The partition instance
|
||||
} rdc_device_role_t;
|
||||
|
||||
/**
|
||||
* @brief The device type
|
||||
*/
|
||||
typedef enum { RDC_DEVICE_TYPE_GPU, RDC_DEVICE_TYPE_CPU } rdc_device_type_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t device_index; //!< Physical device index
|
||||
uint32_t instance_index; //!< Instance or core index
|
||||
rdc_device_role_t entity_role; //!< Physical device or partition instance
|
||||
rdc_device_type_t device_type; //!< Type
|
||||
} rdc_entity_info_t;
|
||||
|
||||
/**
|
||||
* @brief The function to decode the entity info from entity index
|
||||
* @details
* | 31 30 29 | 28 27 | 26 ... 21 | 20 19 ... 12 11 |   10   | 9 8 7 6 5 4 3 2 1 0 |
* |----------|-------|-----------|-----------------|--------|---------------------|
* |   Type   | Role  |  Unused   |    Instance     | Unused |       Device        |
* |----------|-------|-----------|-----------------|--------|---------------------|
* The 32-bit entity index is laid out as shown above: the type, role, instance
* and device fields are 3, 2, 10 and 10 bits wide, matching the shifts and masks
* in rdc_lib/RdcEntityCodec.h. This function decodes such an index into an
* rdc_entity_info_t structure.
|
||||
*
|
||||
* @param[in] entity_index The entity index.
|
||||
*
|
||||
* @retval rdc_entity_info_t is returned for decode structure
|
||||
*/
|
||||
|
||||
rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index);
|
||||
|
||||
/**
|
||||
* @brief The function to encode the entity info to entity index
|
||||
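* @details
* | 31 30 29 | 28 27 | 26 ... 21 | 20 19 ... 12 11 |   10   | 9 8 7 6 5 4 3 2 1 0 |
* |----------|-------|-----------|-----------------|--------|---------------------|
* |   Type   | Role  |  Unused   |    Instance     | Unused |       Device        |
* |----------|-------|-----------|-----------------|--------|---------------------|
* The 32-bit entity index is laid out as shown above: the type, role, instance
* and device fields are 3, 2, 10 and 10 bits wide, matching the shifts and masks
* in rdc_lib/RdcEntityCodec.h. This function encodes an rdc_entity_info_t
* structure into such an index; field values wider than their mask are truncated.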
|
||||
*
|
||||
* @param[in] info The entity info to encode.
|
||||
*
|
||||
* @retval entity_index is returned
|
||||
*/
|
||||
uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info);
|
||||
|
||||
// map from amdsmi_accelerator_partition_resource_type_t
|
||||
typedef enum {
|
||||
RDC_ACCELERATOR_XCC = 0,
|
||||
RDC_ACCELERATOR_ENCODER,
|
||||
RDC_ACCELERATOR_DECODER,
|
||||
RDC_ACCELERATOR_DMA,
|
||||
RDC_ACCELERATOR_JPEG,
|
||||
RDC_ACCELERATOR_RESOURCE_MAX,
|
||||
RDC_ACCELERATOR_LAST = RDC_ACCELERATOR_RESOURCE_MAX
|
||||
} rdc_instance_resource_type_t;
|
||||
|
||||
// map from amdsmi_accelerator_partition_resource_profile_t
|
||||
typedef struct {
|
||||
rdc_instance_resource_type_t resource_type;
|
||||
uint32_t partition_resource; // The resources a partition can be used, which may be shared
|
||||
uint32_t num_partitions_share_resource; // If it is greater than 1, then resource is shared.
|
||||
} rdc_resource_profile_t;
|
||||
|
||||
/**
|
||||
* @brief Query the resource allocation for a device/instance
|
||||
*
|
||||
* @details The profile contains detail information how resource is allocated.
|
||||
*
|
||||
* As an example, MI300X has 8 XCCs and 4 Decoders, in DPX mode, the physical device is
|
||||
* partitioned to 2 instances, so each instance will have 4 XCC and 2 Decoder and they are
|
||||
* not shared.
|
||||
* [XCC, 4, 0], [DECODER, 2, 0]
|
||||
*
|
||||
* If it is CPX mode, the physical device is partitioned to 8 instances, and each instance
|
||||
* has 1 XCC, and every 2 instances share the same decoder.
|
||||
* [XCC, 1, 0], [DECODER, 1, 1]
|
||||
*
|
||||
* If entity_index is the physical device, it should return all resources of the device:
|
||||
* [XCC, 8, 0], [DECODER, 4, 0]
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] entity_index The GPU index to query. It can be physical device or instance.
|
||||
*
|
||||
* @param[in] resource_type Which resource type to query
|
||||
*
|
||||
* @param[out] profile The details how the resource is allocated.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index,
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile);
|
||||
|
||||
/**
|
||||
* @brief Get the number of partitions for the specified GPU index.
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
* @param[in] index The GPU index to query.
|
||||
* @param[out] num_partition Pointer to a variable to receive the number of partitions.
|
||||
*
|
||||
* @retval ::RDC_ST_OK on success.
|
||||
*/
|
||||
rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index,
|
||||
uint16_t* num_partition);
|
||||
|
||||
/**
|
||||
* @brief Check whether a GPU id string is a partition string of the form "gN.M" (e.g. "g0.1")
|
||||
*
|
||||
* @param[in] s - singular partition string
|
||||
* @retval bool - if partition string or not
|
||||
*/
|
||||
bool rdc_is_partition_string(const char* s);
|
||||
|
||||
/**
|
||||
* @brief Parse partition id into physical gpu and partition
|
||||
*
|
||||
* @param[in] s - singular partition string
|
||||
* @param[out] physicalGpu - socket id
|
||||
* @param[out] partition - partition id
|
||||
*
|
||||
* @retval bool - success
|
||||
*/
|
||||
bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_RDCENTITYCODEC_H_
|
||||
#define INCLUDE_RDC_LIB_RDCENTITYCODEC_H_
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
/*
|
||||
*
|
||||
* See rdc.h for description of entity_index
|
||||
* Shifts and masks help get only the bits in question to decode/encode
|
||||
*
|
||||
* Ex, RDC_ENTITY_TYPE_SHIFT = 29 helps shift the 29 irrelevant bits, so we're
|
||||
* only left with the top 3 type bits.
|
||||
* Then, the corresponding 3 type bits are anded with the RDC_ENTITY_TYPE_MASK = 0x7
|
||||
* which = 111 in binary, "copying" the type bits.
|
||||
*
|
||||
*
|
||||
*/
|
||||
static constexpr uint32_t RDC_ENTITY_TYPE_SHIFT = 29;
|
||||
static constexpr uint32_t RDC_ENTITY_ROLE_SHIFT = 27;
|
||||
static constexpr uint32_t RDC_ENTITY_INSTANCE_SHIFT = 11;
|
||||
static constexpr uint32_t RDC_ENTITY_DEVICE_SHIFT = 0;
|
||||
|
||||
static constexpr uint32_t RDC_ENTITY_TYPE_MASK = 0x7; // 3 bits for type.
|
||||
static constexpr uint32_t RDC_ENTITY_ROLE_MASK = 0x3; // 2 bits for role.
|
||||
static constexpr uint32_t RDC_ENTITY_INSTANCE_MASK = 0x3FF; // 10 bits for instance.
|
||||
static constexpr uint32_t RDC_ENTITY_DEVICE_MASK = 0x3FF; // 10 bits for device.
|
||||
|
||||
rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index);
|
||||
uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info);
|
||||
bool rdc_is_partition_string(const char* s);
|
||||
bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition);
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_RDCENTITYCODEC_H_
|
||||
@@ -116,7 +116,7 @@ class RdcHandler {
|
||||
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id,
|
||||
rdc_health_response_t* response) = 0;
|
||||
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
|
||||
// topology API
|
||||
// topology API
|
||||
virtual rdc_status_t rdc_device_topology_get(uint32_t gpu_index,
|
||||
rdc_device_topology_t* results) = 0;
|
||||
virtual rdc_status_t rdc_link_status_get(rdc_link_status_t* results) = 0;
|
||||
@@ -131,6 +131,12 @@ class RdcHandler {
|
||||
// Clear the setting
|
||||
virtual rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_instance_profile_get(uint32_t entity_index,
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile) = 0;
|
||||
|
||||
virtual ~RdcHandler() {}
|
||||
};
|
||||
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_RDCPARTITION_H_
|
||||
#define INCLUDE_RDC_LIB_RDCPARTITION_H_
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcPartition {
|
||||
public:
|
||||
virtual rdc_status_t rdc_instance_profile_get_impl(uint32_t entity_index,
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) = 0;
|
||||
|
||||
virtual ~RdcPartition() {}
|
||||
};
|
||||
typedef std::shared_ptr<RdcPartition> RdcPartitionPtr;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_RDCPARTITION_H_
|
||||
@@ -32,6 +32,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcMetricsUpdater.h"
|
||||
#include "rdc_lib/RdcModuleMgr.h"
|
||||
#include "rdc_lib/RdcNotification.h"
|
||||
#include "rdc_lib/RdcPartition.h"
|
||||
#include "rdc_lib/RdcPolicy.h"
|
||||
#include "rdc_lib/RdcTopologyLink.h"
|
||||
#include "rdc_lib/RdcWatchTable.h"
|
||||
@@ -121,7 +122,7 @@ class RdcEmbeddedHandler final : public RdcHandler {
|
||||
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override;
|
||||
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
|
||||
rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override;
|
||||
|
||||
|
||||
rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override;
|
||||
|
||||
// Set one configure
|
||||
@@ -134,11 +135,18 @@ class RdcEmbeddedHandler final : public RdcHandler {
|
||||
// Clear the setting
|
||||
rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override;
|
||||
|
||||
rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) override;
|
||||
|
||||
rdc_status_t rdc_instance_profile_get(uint32_t entity_index,
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile) override;
|
||||
|
||||
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
|
||||
~RdcEmbeddedHandler() final;
|
||||
|
||||
private:
|
||||
rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges);
|
||||
RdcPartitionPtr partition_;
|
||||
RdcGroupSettingsPtr group_settings_;
|
||||
RdcCacheManagerPtr cache_mgr_;
|
||||
RdcMetricFetcherPtr metric_fetcher_;
|
||||
@@ -150,7 +158,6 @@ class RdcEmbeddedHandler final : public RdcHandler {
|
||||
RdcTopologyLinkPtr topologylink_;
|
||||
RdcConfigSettingsPtr config_handler_;
|
||||
std::future<void> updater_;
|
||||
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
|
||||
@@ -28,6 +28,7 @@ THE SOFTWARE.
|
||||
#include <string>
|
||||
|
||||
#include "rdc_lib/RdcGroupSettings.h"
|
||||
#include "rdc_lib/impl/RdcPartitionImpl.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -51,7 +52,7 @@ class RdcGroupSettingsImpl : public RdcGroupSettings {
|
||||
rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
|
||||
uint32_t* count) override;
|
||||
|
||||
RdcGroupSettingsImpl();
|
||||
explicit RdcGroupSettingsImpl(const RdcPartitionPtr& partition);
|
||||
|
||||
private:
|
||||
std::map<rdc_gpu_group_t, rdc_group_info_t> gpu_group_;
|
||||
@@ -60,6 +61,7 @@ class RdcGroupSettingsImpl : public RdcGroupSettings {
|
||||
uint32_t cur_field_group_id_ = 0;
|
||||
std::mutex group_mutex_;
|
||||
std::mutex field_group_mutex_;
|
||||
RdcPartitionPtr partition_;
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcPartition.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcPartitionImpl : public RdcPartition {
|
||||
public:
|
||||
rdc_status_t rdc_instance_profile_get_impl(uint32_t entity_index,
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile);
|
||||
rdc_status_t rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition);
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_IMPL_RDCPARTITIONIMPL_H_
|
||||
@@ -126,9 +126,15 @@ class RdcStandaloneHandler : public RdcHandler {
|
||||
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override;
|
||||
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
|
||||
rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override;
|
||||
|
||||
|
||||
rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override;
|
||||
|
||||
rdc_status_t rdc_get_num_partition(uint32_t index, uint16_t* num_partition) override;
|
||||
|
||||
rdc_status_t rdc_instance_profile_get(uint32_t entity_index,
|
||||
rdc_instance_resource_type_t resource_type,
|
||||
rdc_resource_profile_t* profile) override;
|
||||
|
||||
explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
|
||||
const char* client_cert, const char* client_key);
|
||||
|
||||
|
||||
@@ -23,6 +23,8 @@ THE SOFTWARE.
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
@@ -33,6 +35,12 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi);
|
||||
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
|
||||
amdsmi_processor_handle* processor_handle);
|
||||
amdsmi_status_t get_processor_count(uint32_t& all_processor_count);
|
||||
amdsmi_status_t get_socket_handles(std::vector<amdsmi_socket_handle>& sockets);
|
||||
amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket,
|
||||
std::vector<amdsmi_processor_handle>& processors);
|
||||
amdsmi_status_t get_kfd_partition_id(amdsmi_processor_handle proc, uint32_t* partition_id);
|
||||
amdsmi_status_t get_metrics_info(amdsmi_processor_handle proc, amdsmi_gpu_metrics_t* metrics);
|
||||
amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition);
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -219,6 +219,13 @@ service RdcAPI {
|
||||
|
||||
// rdc_status_t GetLinkStatus()
|
||||
rpc GetLinkStatus(Empty) returns (GetLinkStatusResponse) {}
|
||||
|
||||
// Get number of partitions
|
||||
rpc GetNumPartition(GetNumPartitionRequest) returns (GetNumPartitionResponse);
|
||||
|
||||
// Get instance profile of gpu
|
||||
rpc GetInstanceProfile(GetInstanceProfileRequest) returns (GetInstanceProfileResponse);
|
||||
|
||||
}
|
||||
|
||||
message Empty {
|
||||
@@ -804,3 +811,28 @@ message ClearConfigRequest {
|
||||
message ClearConfigResponse {
|
||||
uint32 status = 1;
|
||||
}
|
||||
|
||||
// Request for getting the number of partitions for a given GPU index.
|
||||
message GetNumPartitionRequest {
|
||||
// The GPU index for which to query the number of partitions.
|
||||
uint32 gpu_index = 1;
|
||||
}
|
||||
|
||||
// Response for getting the number of partitions.
|
||||
message GetNumPartitionResponse {
|
||||
// Status of the operation, following RDC_ST_* codes.
|
||||
uint32 status = 1;
|
||||
// Number of partitions for the given GPU.
|
||||
uint32 num_partition = 2;
|
||||
}
|
||||
|
||||
message GetInstanceProfileRequest {
|
||||
uint32 entity_index = 1;
|
||||
uint32 resource_type = 2;
|
||||
}
|
||||
|
||||
message GetInstanceProfileResponse {
|
||||
uint32 status = 1;
|
||||
uint32 partition_resource = 2;
|
||||
uint32 num_partitions_share_resource = 3;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import os,time
|
||||
from rdc_bootstrap import *
|
||||
from RdcUtil import RdcUtil
|
||||
from typing import Dict
|
||||
|
||||
default_field_ids = [
|
||||
rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
|
||||
@@ -26,10 +27,18 @@ default_unit_coverter = {
|
||||
rdc_field_t.RDC_FI_GPU_TEMP: 0.001, # degree
|
||||
}
|
||||
|
||||
class rdc_entity_info_t(Structure):
    # ctypes mirror of the C struct rdc_entity_info_t declared in rdc/rdc.h.
    # The struct is passed to and returned from the C library by value, so the
    # field order here MUST match the C declaration order exactly:
    #   uint32_t device_index; uint32_t instance_index;
    #   rdc_device_role_t entity_role; rdc_device_type_t device_type;
    # Listing the fields in any other order silently swaps their values.
    _fields_ = [
        ("device_index", c_uint32),    # physical device index
        ("instance_index", c_uint32),  # instance or core index
        ("entity_role", c_uint32),     # physical device or partition instance
        ("device_type", c_uint32),     # device type
    ]
|
||||
|
||||
class RdcReader:
|
||||
# To run the RDC in embedded mode, set the ip_port = None
|
||||
def __init__(self, ip_port = "localhost:50051", field_ids = default_field_ids,
|
||||
unit_converter: dict[int, float] = default_unit_coverter,
|
||||
unit_converter: Dict[int, float] = default_unit_coverter,
|
||||
update_freq = 10000000, max_keep_age = 3600.0 , max_keep_samples = 1000,
|
||||
field_group_name = "rdc_reader_field_group", gpu_group_name = "rdc_reader_gpu_group",
|
||||
gpu_indexes = None, root_ca = "/etc/rdc/client/certs/rdc_cacert.pem",
|
||||
@@ -44,6 +53,11 @@ class RdcReader:
|
||||
self.unit_converter = unit_converter
|
||||
self.rdc_handle = c_void_p()
|
||||
|
||||
rdc.rdc_get_entity_index_from_info.argtypes = [rdc_entity_info_t]
|
||||
rdc.rdc_get_entity_index_from_info.restype = c_uint32
|
||||
rdc.rdc_get_info_from_entity_index.argtypes = [c_uint32]
|
||||
rdc.rdc_get_info_from_entity_index.restype = rdc_entity_info_t
|
||||
|
||||
self.is_standalone = True
|
||||
if not ip_port: # embedded
|
||||
self.is_standalone = False
|
||||
@@ -69,7 +83,25 @@ class RdcReader:
|
||||
if gpu_indexes == None:
|
||||
self.gpu_indexes = self.rdc_util.get_all_gpu_indexes(self.rdc_handle)
|
||||
else:
|
||||
self.gpu_indexes = gpu_indexes
|
||||
self.gpu_indexes = []
|
||||
for idx in gpu_indexes:
|
||||
idx_str = str(idx)
|
||||
encoded = idx_str.encode("utf-8")
|
||||
phys_gpu = ctypes.c_uint32()
|
||||
part_idx = ctypes.c_uint32()
|
||||
if rdc.rdc_is_partition_string(encoded):
|
||||
rc = rdc.rdc_parse_partition_string(encoded, ctypes.byref(phys_gpu), ctypes.byref(part_idx))
|
||||
if not rc:
|
||||
raise Exception("Rdc failed to parse partition string")
|
||||
info = rdc_entity_info_t()
|
||||
info.device_type = 0 #RDC_DEVICE_TYPE_GPU
|
||||
info.entity_role = 1 #RDC_DEVICE_ROLE_PARTITION
|
||||
info.instance_index = part_idx
|
||||
info.device_index = phys_gpu
|
||||
entity = rdc.rdc_get_entity_index_from_info(info)
|
||||
self.gpu_indexes.append(entity)
|
||||
else:
|
||||
self.gpu_indexes.append(int(idx_str))
|
||||
self.gpu_group_id, gpu_group_created = self.rdc_util.create_gpu_group(self.rdc_handle, self.gpu_group_name, self.gpu_indexes)
|
||||
|
||||
# Create the field group
|
||||
@@ -140,8 +172,16 @@ class RdcReader:
|
||||
|
||||
|
||||
def handle_field(self, gpu_index, value):
|
||||
|
||||
info = rdc.rdc_get_info_from_entity_index(gpu_index)
|
||||
|
||||
if info.entity_role == 1: #RDC_DEVICE_ROLE_PARTITION_INSTANCE
|
||||
gpu_str = f"g{info.device_index}.{info.instance_index}"
|
||||
else:
|
||||
gpu_str = str(info.device_index)
|
||||
|
||||
field_name = self.rdc_util.field_id_string(value.field_id)
|
||||
print("%d %d:%d %s:%d" % (value.ts, gpu_index, value.field_id.value, field_name, value.value.l_int))
|
||||
print("%d %s:%d %s:%d" % (value.ts, gpu_str, value.field_id.value, field_name, value.value.l_int))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -12,12 +12,14 @@ set(BOOTSTRAP_LIB_SRC_LIST
|
||||
"${COMMON_DIR}/rdc_fields_supported.cc"
|
||||
"${SRC_DIR}/RdcBootStrap.cc"
|
||||
"${SRC_DIR}/RdcLibraryLoader.cc"
|
||||
"${SRC_DIR}/RdcLogger.cc")
|
||||
"${SRC_DIR}/RdcLogger.cc"
|
||||
"${SRC_DIR}/RdcEntityCodec.cc")
|
||||
set(BOOTSTRAP_LIB_INC_LIST
|
||||
"${COMMON_DIR}/rdc_fields_supported.h"
|
||||
"${INC_DIR}/RdcHandler.h"
|
||||
"${INC_DIR}/RdcLibraryLoader.h"
|
||||
"${INC_DIR}/RdcLogger.h"
|
||||
"${INC_DIR}/RdcEntityCodec.h"
|
||||
"${INC_DIR}/rdc_common.h"
|
||||
"${PROJECT_SOURCE_DIR}/include/rdc/rdc.h")
|
||||
message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}")
|
||||
|
||||
@@ -532,8 +532,26 @@ rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* r
|
||||
if (!p_rdc_handle) {
|
||||
return RDC_ST_INVALID_HANDLER;
|
||||
}
|
||||
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_link_status_get(results);
|
||||
}
|
||||
|
||||
// C API entry point: reports the number of partitions of the GPU `index`.
//
// p_rdc_handle  : opaque handle, cast to amd::rdc::RdcHandler* for dispatch.
// index         : GPU index to query.
// num_partition : receives the partition count.
//
// Returns RDC_ST_INVALID_HANDLER when either the handle or the output pointer
// is null; otherwise returns whatever the handler's implementation returns.
rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index,
                                   uint16_t* num_partition) {
  // NOTE(review): a null num_partition is reported with the same status as a
  // null handle; kept as-is so existing callers see the same error code.
  if (!p_rdc_handle || !num_partition) {
    return RDC_ST_INVALID_HANDLER;
  }
  return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
      ->rdc_get_num_partition(index, num_partition);
}
|
||||
|
||||
// C API entry point: queries how `resource_type` is allocated for the entity
// `entity_index` and stores the answer in `profile`. A null handle or a null
// `profile` yields RDC_ST_INVALID_HANDLER; otherwise the call is forwarded to
// the RdcHandler behind the handle and its status is returned.
rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index,
                                      rdc_instance_resource_type_t resource_type,
                                      rdc_resource_profile_t* profile) {
  const bool has_handle = p_rdc_handle != nullptr;
  const bool has_output = profile != nullptr;
  if (has_handle && has_output) {
    auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
    return handler->rdc_instance_profile_get(entity_index, resource_type, profile);
  }
  return RDC_ST_INVALID_HANDLER;
}
|
||||
|
||||
const char * get_rocm_path(const char * search_string) {
|
||||
@@ -573,4 +591,3 @@ const char * get_rocm_path(const char * search_string) {
|
||||
|
||||
return rocm_path.c_str();
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <rdc/rdc.h>
|
||||
#include <rdc_lib/RdcEntityCodec.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "common/rdc_utils.h"
|
||||
|
||||
// Splits a packed 32-bit entity index into its device type, role, instance
// index and device index, using the RDC_ENTITY_*_SHIFT / RDC_ENTITY_*_MASK
// constants from RdcEntityCodec.h.
rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index) {
  // Isolate one bit-field: shift it down to bit 0, then mask off the rest.
  const auto extract = [entity_index](uint32_t shift, uint32_t mask) -> uint32_t {
    return (entity_index >> shift) & mask;
  };

  rdc_entity_info_t decoded;
  decoded.device_index = extract(RDC_ENTITY_DEVICE_SHIFT, RDC_ENTITY_DEVICE_MASK);
  decoded.instance_index = extract(RDC_ENTITY_INSTANCE_SHIFT, RDC_ENTITY_INSTANCE_MASK);
  decoded.entity_role =
      static_cast<rdc_device_role_t>(extract(RDC_ENTITY_ROLE_SHIFT, RDC_ENTITY_ROLE_MASK));
  decoded.device_type =
      static_cast<rdc_device_type_t>(extract(RDC_ENTITY_TYPE_SHIFT, RDC_ENTITY_TYPE_MASK));
  return decoded;
}
|
||||
|
||||
// Packs the fields of `info` into a single 32-bit entity index. Each field is
// masked to its width and shifted into position, so bits that do not fit in a
// field are dropped.
uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info) {
  // Clamp a value to its field width and move it to the field's bit position.
  const auto place = [](uint32_t value, uint32_t mask, uint32_t shift) -> uint32_t {
    return (value & mask) << shift;
  };

  const uint32_t type_bits =
      place(static_cast<uint32_t>(info.device_type), RDC_ENTITY_TYPE_MASK, RDC_ENTITY_TYPE_SHIFT);
  const uint32_t role_bits =
      place(static_cast<uint32_t>(info.entity_role), RDC_ENTITY_ROLE_MASK, RDC_ENTITY_ROLE_SHIFT);
  const uint32_t instance_bits =
      place(info.instance_index, RDC_ENTITY_INSTANCE_MASK, RDC_ENTITY_INSTANCE_SHIFT);
  const uint32_t device_bits =
      place(info.device_index, RDC_ENTITY_DEVICE_MASK, RDC_ENTITY_DEVICE_SHIFT);

  return type_bits | role_bits | instance_bits | device_bits;
}
|
||||
|
||||
bool rdc_is_partition_string(const char* s) {
|
||||
if (!s || s[0] == '\0') {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (s[0] != 'g') {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string str(s);
|
||||
size_t dotPos = str.find('.');
|
||||
if (dotPos == std::string::npos) return false;
|
||||
|
||||
if (dotPos <= 1 || dotPos >= str.size() - 1) return false;
|
||||
|
||||
std::string gpuPart = str.substr(1, dotPos - 1);
|
||||
std::string partitionPart = str.substr(dotPos + 1);
|
||||
|
||||
if (!std::all_of(gpuPart.begin(), gpuPart.end(), ::isdigit) ||
|
||||
!std::all_of(partitionPart.begin(), partitionPart.end(), ::isdigit))
|
||||
return false;
|
||||
|
||||
int gpuIndex = std::stoi(gpuPart);
|
||||
int partitionIndex = std::stoi(partitionPart);
|
||||
|
||||
if (gpuIndex < 0 || gpuIndex >= RDC_MAX_NUM_DEVICES) return false;
|
||||
if (partitionIndex < 0 || partitionIndex >= RDC_MAX_NUM_PARTITIONS) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition) {
|
||||
if (!s) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!rdc_is_partition_string(s)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string str(s);
|
||||
|
||||
std::string rest = str.substr(1);
|
||||
size_t pos = rest.find('.');
|
||||
|
||||
if (pos == std::string::npos) return false;
|
||||
|
||||
std::string gpuStr = rest.substr(0, pos);
|
||||
std::string partStr = rest.substr(pos + 1);
|
||||
|
||||
// Ensure both parts are a number
|
||||
if (!(!gpuStr.empty() && std::all_of(gpuStr.begin(), gpuStr.end(), ::isdigit)) ||
|
||||
!(!partStr.empty() && std::all_of(partStr.begin(), partStr.end(), ::isdigit))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*physicalGpu = std::stoi(gpuStr);
|
||||
*partition = std::stoi(partStr);
|
||||
return true;
|
||||
}
|
||||
@@ -28,6 +28,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
|
||||
"${SRC_DIR}/RdcConfigSettingsImpl.cc"
|
||||
"${SRC_DIR}/RdcTelemetryModule.cc"
|
||||
"${SRC_DIR}/RdcWatchTableImpl.cc"
|
||||
"${SRC_DIR}/RdcPartitionImpl.cc"
|
||||
"${SRC_DIR}/SmiUtils.cc")
|
||||
|
||||
# TODO: remove all headers? Will just dir be ok after install?
|
||||
@@ -60,6 +61,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
|
||||
"${INC_DIR}/impl/RdcSmiLib.h"
|
||||
"${INC_DIR}/impl/RdcTelemetryModule.h"
|
||||
"${INC_DIR}/impl/RdcWatchTableImpl.h"
|
||||
"${INC_DIR}/impl/RdcPartitionImpl.h"
|
||||
"${INC_DIR}/impl/SmiUtils.h")
|
||||
|
||||
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
|
||||
|
||||
@@ -36,6 +36,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
|
||||
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
|
||||
#include "rdc_lib/impl/RdcNotificationImpl.h"
|
||||
#include "rdc_lib/impl/RdcPartitionImpl.h"
|
||||
#include "rdc_lib/impl/RdcPolicyImpl.h"
|
||||
#include "rdc_lib/impl/RdcTopologyLinkImpl.h"
|
||||
#include "rdc_lib/impl/RdcWatchTableImpl.h"
|
||||
@@ -76,7 +77,8 @@ namespace rdc {
|
||||
const uint32_t METIC_UPDATE_FREQUENCY = 1000; // 1000 microseconds by default
|
||||
|
||||
RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
|
||||
: group_settings_(new RdcGroupSettingsImpl()),
|
||||
: partition_(new RdcPartitionImpl()),
|
||||
group_settings_(new RdcGroupSettingsImpl(partition_)),
|
||||
cache_mgr_(new RdcCacheManagerImpl()),
|
||||
metric_fetcher_(new RdcMetricFetcherImpl()),
|
||||
rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
|
||||
@@ -261,9 +263,14 @@ rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, uin
|
||||
if (status != RDC_ST_OK) {
|
||||
return status;
|
||||
}
|
||||
|
||||
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index);
|
||||
|
||||
uint32_t physical_gpu = info.device_index;
|
||||
|
||||
bool is_gpu_exist = false;
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
if (gpu_index_list[i] == gpu_index) {
|
||||
if (gpu_index_list[i] == physical_gpu) {
|
||||
is_gpu_exist = true;
|
||||
break;
|
||||
}
|
||||
@@ -527,5 +534,14 @@ rdc_status_t RdcEmbeddedHandler::rdc_config_clear(rdc_gpu_group_t group_id) {
|
||||
return config_handler_->rdc_config_clear(group_id);
|
||||
}
|
||||
|
||||
// Partition-count query for the embedded handler.
// Pure pass-through: the arguments are handed to the partition module's
// rdc_get_num_partition_impl and its status code is returned unchanged.
rdc_status_t RdcEmbeddedHandler::rdc_get_num_partition(uint32_t index, uint16_t* num_partition) {
  const rdc_status_t partition_status =
      partition_->rdc_get_num_partition_impl(index, num_partition);
  return partition_status;
}
|
||||
|
||||
// Resource-profile query for the embedded handler.
// Pure pass-through: entity_index, resource_type and the output pointer are
// handed to the partition module's rdc_instance_profile_get_impl, and its
// status code is returned unchanged.
rdc_status_t RdcEmbeddedHandler::rdc_instance_profile_get(
    uint32_t entity_index, rdc_instance_resource_type_t resource_type,
    rdc_resource_profile_t* profile) {
  const rdc_status_t profile_status =
      partition_->rdc_instance_profile_get_impl(entity_index, resource_type, profile);
  return profile_status;
}
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -23,13 +23,17 @@ THE SOFTWARE.
|
||||
|
||||
#include <ctime>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RdcPartitionImpl.h"
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
|
||||
RdcGroupSettingsImpl::RdcGroupSettingsImpl(const RdcPartitionPtr& partition)
|
||||
: partition_(partition) {
|
||||
// Add the default job stats fields
|
||||
rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK,
|
||||
RDC_FI_GPU_UTIL, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX,
|
||||
@@ -67,23 +71,50 @@ rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_g
|
||||
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) {
|
||||
std::lock_guard<std::mutex> guard(group_mutex_);
|
||||
auto ite = gpu_group_.find(groupId);
|
||||
if (ite != gpu_group_.end()) {
|
||||
// Check whether the index already exists
|
||||
for (uint32_t i = 0; i < ite->second.count; i++) {
|
||||
if (ite->second.entity_ids[i] == gpu_index) {
|
||||
RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId
|
||||
<< " as it is already exists");
|
||||
if (ite == gpu_group_.end()) {
|
||||
return RDC_ST_NOT_FOUND;
|
||||
}
|
||||
|
||||
rdc_entity_info_t entity_info = rdc_get_info_from_entity_index(gpu_index);
|
||||
|
||||
uint16_t num_partitions = 0;
|
||||
rdc_status_t status =
|
||||
partition_->rdc_get_num_partition_impl(entity_info.device_index, &num_partitions);
|
||||
if (status != RDC_ST_OK) {
|
||||
return status;
|
||||
}
|
||||
|
||||
if (num_partitions != UINT16_MAX && num_partitions > 1) {
|
||||
if (entity_info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
|
||||
if (entity_info.instance_index >= num_partitions) {
|
||||
RDC_LOG(RDC_INFO, "Invalid partition instance: GPU "
|
||||
<< entity_info.device_index << " supports " << num_partitions
|
||||
<< " partitions, but instance index is "
|
||||
<< entity_info.instance_index);
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
}
|
||||
if (ite->second.count < RDC_GROUP_MAX_ENTITIES) {
|
||||
ite->second.entity_ids[ite->second.count] = gpu_index;
|
||||
ite->second.count++;
|
||||
} else {
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
}
|
||||
} else {
|
||||
return RDC_ST_NOT_FOUND;
|
||||
if (entity_info.entity_role != RDC_DEVICE_ROLE_PHYSICAL) {
|
||||
RDC_LOG(RDC_INFO, "GPU " << entity_info.device_index
|
||||
<< " is not partitionable, but a partition instance was provided.");
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
}
|
||||
|
||||
// Check whether the index already exists
|
||||
for (uint32_t i = 0; i < ite->second.count; i++) {
|
||||
if (ite->second.entity_ids[i] == gpu_index) {
|
||||
RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId
|
||||
<< " as it is already exists");
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
}
|
||||
if (ite->second.count < RDC_GROUP_MAX_ENTITIES) {
|
||||
ite->second.entity_ids[ite->second.count] = gpu_index;
|
||||
ite->second.count++;
|
||||
} else {
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
|
||||
@@ -471,9 +471,18 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
|
||||
amdsmi_processor_handle processor_handle = {};
|
||||
|
||||
amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index);
|
||||
|
||||
amdsmi_status_t ret = get_processor_handle_from_id(info.device_index, &processor_handle);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_index << " error: " << ret);
|
||||
std::string info_str;
|
||||
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
|
||||
info_str =
|
||||
"g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index);
|
||||
} else {
|
||||
info_str = std::to_string(info.device_index);
|
||||
}
|
||||
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << info_str << " error: " << ret);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
@@ -486,6 +495,138 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
value->field_id = field_id;
|
||||
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
|
||||
uint16_t num_partitions = 0;
|
||||
amdsmi_status_t st = get_num_partition(info.device_index, &num_partitions);
|
||||
if (st != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get partition info for GPU " << info.device_index);
|
||||
return RDC_ST_UNKNOWN_ERROR;
|
||||
}
|
||||
|
||||
amdsmi_processor_handle processor_handle = {};
|
||||
amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Cannot get processor handle for partition " << info.instance_index);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
amdsmi_gpu_metrics_t gpu_metrics = {};
|
||||
ret = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get GPU metrics info for partition " << info.instance_index);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
switch (field_id) {
|
||||
case RDC_FI_GPU_CLOCK: {
|
||||
const uint16_t* clock_array = gpu_metrics.current_gfxclks;
|
||||
std::vector<uint16_t> valid_clocks;
|
||||
valid_clocks.reserve(8);
|
||||
|
||||
for (uint32_t i = 0; i < 8; i++) {
|
||||
uint16_t clk = clock_array[i];
|
||||
if (clk != 0 && clk != 0xFFFF) {
|
||||
valid_clocks.push_back(clk);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t vc = static_cast<uint32_t>(valid_clocks.size());
|
||||
uint32_t pCount = static_cast<uint32_t>(num_partitions);
|
||||
uint32_t partIdx = info.instance_index;
|
||||
|
||||
if (valid_clocks.empty() || vc < num_partitions) {
|
||||
RDC_LOG(RDC_ERROR, "No valid clocks, or less than total partitions");
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
|
||||
if (vc == num_partitions) {
|
||||
value->value.l_int = clock_array[info.instance_index] * 1000000;
|
||||
value->type = INTEGER;
|
||||
value->status = RDC_ST_OK;
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
uint32_t chunk_size = vc / pCount;
|
||||
uint32_t start_idx = partIdx * chunk_size;
|
||||
uint32_t end_idx = start_idx + chunk_size;
|
||||
|
||||
// Average partition clocks
|
||||
uint64_t sum = 0;
|
||||
for (uint32_t i = start_idx; i < end_idx; i++) {
|
||||
sum += valid_clocks[i];
|
||||
}
|
||||
uint32_t count = end_idx - start_idx;
|
||||
if (count == 0) {
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
uint64_t avg_clock = sum / count;
|
||||
|
||||
value->value.l_int = avg_clock * 1000000;
|
||||
value->type = INTEGER;
|
||||
value->status = RDC_ST_OK;
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
case RDC_FI_GPU_UTIL: {
|
||||
uint32_t p = info.instance_index;
|
||||
if (p >= AMDSMI_MAX_NUM_XCP) {
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p];
|
||||
|
||||
uint64_t sum = 0;
|
||||
uint32_t count = 0;
|
||||
for (uint32_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) {
|
||||
uint32_t busy = xcp.gfx_busy_inst[i];
|
||||
if (busy != UINT32_MAX) {
|
||||
sum += busy;
|
||||
count++;
|
||||
}
|
||||
}
|
||||
if (count == 0) {
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
uint64_t avg_busy = sum / count;
|
||||
value->value.l_int = avg_busy;
|
||||
value->type = INTEGER;
|
||||
value->status = RDC_ST_OK;
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
case RDC_FI_GPU_MM_DEC_UTIL: {
|
||||
uint32_t p = info.instance_index;
|
||||
if (p >= AMDSMI_MAX_NUM_XCP) {
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p];
|
||||
|
||||
uint64_t sum = 0;
|
||||
uint32_t count = 0;
|
||||
for (uint32_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) {
|
||||
uint16_t vcn = xcp.vcn_busy[i];
|
||||
if (vcn != UINT16_MAX) {
|
||||
sum += vcn;
|
||||
count++;
|
||||
}
|
||||
}
|
||||
if (count == 0) {
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
uint64_t avg_decode = sum / count;
|
||||
value->value.l_int = avg_decode;
|
||||
value->type = INTEGER;
|
||||
value->status = RDC_ST_OK;
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
default:
|
||||
// All other fields => N/A for partition
|
||||
RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id)
|
||||
<< " not supported => NO_DATA.");
|
||||
return RDC_ST_NO_DATA;
|
||||
}
|
||||
} // end if partition
|
||||
|
||||
auto read_smi_counter = [&](void) {
|
||||
RdcFieldKey f_key(gpu_index, field_id);
|
||||
smi_data = get_smi_data(f_key);
|
||||
@@ -600,12 +741,11 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
break;
|
||||
}
|
||||
case RDC_FI_GPU_COUNT: {
|
||||
uint32_t processor_count = 0;
|
||||
// amdsmi is initialized in AMDSMI_INIT_AMD_GPUS mode -> returned sockets are GPUs
|
||||
value->status = get_processor_count(processor_count);
|
||||
uint32_t socket_count = 0;
|
||||
value->status = amdsmi_get_socket_handles(&socket_count, nullptr);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(processor_count);
|
||||
value->value.l_int = static_cast<int64_t>(socket_count);
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_POWER_USAGE: {
|
||||
@@ -913,8 +1053,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
value->value.l_int = static_cast<int64_t>(pending_page_num);
|
||||
}
|
||||
}
|
||||
} else
|
||||
} else {
|
||||
value->status = Smi2RdcError(ret);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
Copyright (c) 2025 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcPartitionImpl.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Look up the XCC or decoder resource profile that applies to an entity.
//
// entity_index   Encoded entity index; decoded with rdc_get_info_from_entity_index.
// resource_type  RDC_ACCELERATOR_XCC or RDC_ACCELERATOR_DECODER.
// profile        Output; zeroed first, then filled from the matching SMI
//                resource profile entry.
//
// Returns:
//   RDC_ST_BAD_PARAMETER  profile is null.
//   RDC_ST_OK             a matching entry was found, or SMI reported the
//                         partition profile config as not supported (profile
//                         stays zeroed in that case).
//   RDC_ST_NOT_SUPPORTED  resource_type is neither XCC nor decoder.
//   RDC_ST_UNKNOWN_ERROR  an SMI call failed or no entry matched.
rdc_status_t RdcPartitionImpl::rdc_instance_profile_get_impl(
    uint32_t entity_index, rdc_instance_resource_type_t resource_type,
    rdc_resource_profile_t* profile) {
  if (nullptr == profile) {
    return RDC_ST_BAD_PARAMETER;
  }
  // Start from an empty result so the early exits below leave zeros behind.
  profile->partition_resource = 0;
  profile->num_partitions_share_resource = 0;

  const rdc_entity_info_t entity = rdc_get_info_from_entity_index(entity_index);

  // Every SMI query below is issued against the handle of the physical device.
  amdsmi_processor_handle smi_handle;
  if (get_processor_handle_from_id(entity.device_index, &smi_handle) != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  amdsmi_accelerator_partition_profile_config_t profile_config;
  memset(&profile_config, 0, sizeof(profile_config));
  const amdsmi_status_t config_status =
      amdsmi_get_gpu_accelerator_partition_profile_config(smi_handle, &profile_config);
  if (config_status == AMDSMI_STATUS_NOT_SUPPORTED) {
    // Treated as success: the caller receives the zeroed profile.
    return RDC_ST_OK;
  }
  if (config_status != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  amdsmi_accelerator_partition_profile_t current_profile;
  memset(&current_profile, 0, sizeof(current_profile));
  uint32_t unused_out = 0;  // Required by the SMI call; the value is not used here.
  if (amdsmi_get_gpu_accelerator_partition_profile(smi_handle, &current_profile, &unused_out) !=
      AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }

  // A partition instance is matched against the active profile; a physical
  // device uses profile 0 to get all XCCs/decoders.
  uint32_t wanted_profile = 0;
  if (entity.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
    wanted_profile = current_profile.profile_index;
  }

  // Translate the RDC resource type into its SMI counterpart.
  amdsmi_accelerator_partition_resource_type_t wanted_resource;
  if (resource_type == RDC_ACCELERATOR_XCC) {
    wanted_resource = AMDSMI_ACCELERATOR_XCC;
  } else if (resource_type == RDC_ACCELERATOR_DECODER) {
    wanted_resource = AMDSMI_ACCELERATOR_DECODER;
  } else {
    return RDC_ST_NOT_SUPPORTED;
  }

  // Scan the resource profile table; the first entry matching both the
  // profile index and the resource type wins.
  for (uint32_t slot = 0; slot < AMDSMI_MAX_CP_PROFILE_RESOURCES; ++slot) {
    const auto& candidate = profile_config.resource_profiles[slot];
    if (candidate.profile_index != wanted_profile || candidate.resource_type != wanted_resource) {
      continue;
    }
    const uint32_t resource_amount = candidate.partition_resource;
    const uint32_t sharing_partitions = candidate.num_partitions_share_resource;
    profile->partition_resource = resource_amount;
    profile->num_partitions_share_resource = sharing_partitions;
    return RDC_ST_OK;
  }

  return RDC_ST_UNKNOWN_ERROR;
}
|
||||
|
||||
// Fetch the partition count for a device through get_num_partition.
//
// index          Device index handed to get_num_partition.
// num_partition  Output location written by get_num_partition; must not be null.
//
// Returns:
//   RDC_ST_BAD_PARAMETER  num_partition is null.
//   RDC_ST_UNKNOWN_ERROR  get_num_partition did not report AMDSMI_STATUS_SUCCESS.
//   RDC_ST_OK             otherwise.
rdc_status_t RdcPartitionImpl::rdc_get_num_partition_impl(uint32_t index, uint16_t* num_partition) {
  // Reject a null output pointer up front, the same way
  // rdc_instance_profile_get_impl guards its output argument, rather than
  // passing it down to the SMI helper.
  if (num_partition == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  if (get_num_partition(index, num_partition) != AMDSMI_STATUS_SUCCESS) {
    return RDC_ST_UNKNOWN_ERROR;
  }
  return RDC_ST_OK;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -34,8 +34,8 @@ THE SOFTWARE.
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -209,6 +209,42 @@ rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
|
||||
return result;
|
||||
}
|
||||
|
||||
// Check for rocprof fields in partitions
|
||||
rdc_group_info_t ginfo;
|
||||
result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
|
||||
if (result != RDC_ST_OK) {
|
||||
return result;
|
||||
}
|
||||
bool groupHasPartition = false;
|
||||
for (unsigned int i = 0; i < ginfo.count; i++) {
|
||||
uint32_t entityId = ginfo.entity_ids[i];
|
||||
rdc_entity_info_t info = rdc_get_info_from_entity_index(entityId);
|
||||
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
|
||||
groupHasPartition = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rdc_field_group_info_t field_info;
|
||||
result = group_settings_->rdc_group_field_get_info(field_group_id, &field_info);
|
||||
if (result != RDC_ST_OK) {
|
||||
return result;
|
||||
}
|
||||
bool groupHasRocprof = false;
|
||||
if (result == RDC_ST_OK) {
|
||||
for (unsigned int i = 0; i < field_info.count; i++) {
|
||||
rdc_field_t fid = field_info.field_ids[i];
|
||||
if (fid >= 800 && fid < 900) { // Rocprof fields in the 800's
|
||||
groupHasRocprof = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (groupHasPartition && groupHasRocprof) {
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// See if any of the fields are notification fields, and
|
||||
// set them up, if so.
|
||||
result = notifications_->set_listen_events(fields_in_watch);
|
||||
@@ -381,30 +417,30 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component
|
||||
// set filed ids
|
||||
std::vector<rdc_field_t> field_ids{};
|
||||
if (components & RDC_HEALTH_WATCH_PCIE) {
|
||||
field_ids.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT);
|
||||
field_ids.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT);
|
||||
}
|
||||
|
||||
if (components & RDC_HEALTH_WATCH_XGMI) {
|
||||
field_ids.push_back(RDC_HEALTH_XGMI_ERROR);
|
||||
field_ids.push_back(RDC_HEALTH_XGMI_ERROR);
|
||||
}
|
||||
|
||||
if (components & RDC_HEALTH_WATCH_MEM) {
|
||||
field_ids.push_back(RDC_FI_ECC_UNCORRECT_TOTAL);
|
||||
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM);
|
||||
field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM);
|
||||
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT);
|
||||
field_ids.push_back(RDC_FI_ECC_UNCORRECT_TOTAL);
|
||||
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM);
|
||||
field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM);
|
||||
field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT);
|
||||
}
|
||||
|
||||
if (components & RDC_HEALTH_WATCH_EEPROM) {
|
||||
field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID);
|
||||
field_ids.push_back(RDC_HEALTH_EEPROM_CONFIG_VALID);
|
||||
}
|
||||
|
||||
if (components & RDC_HEALTH_WATCH_THERMAL) {
|
||||
field_ids.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME);
|
||||
field_ids.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME);
|
||||
}
|
||||
|
||||
if (components & RDC_HEALTH_WATCH_POWER) {
|
||||
field_ids.push_back(RDC_HEALTH_POWER_THROTTLE_TIME);
|
||||
field_ids.push_back(RDC_HEALTH_POWER_THROTTLE_TIME);
|
||||
}
|
||||
|
||||
if (0 == field_ids.size()) {
|
||||
@@ -417,8 +453,7 @@ rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int component
|
||||
field_group_name.c_str(), field_group_id);
|
||||
}
|
||||
|
||||
rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
|
||||
unsigned int components) {
|
||||
rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) {
|
||||
// remove old health for same group_id
|
||||
rdc_health_clear(group_id);
|
||||
|
||||
@@ -447,13 +482,11 @@ rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
|
||||
// get initial values
|
||||
rdc_field_value value;
|
||||
result = metric_fetcher_->fetch_smi_field(fields->first, fields->second, &value);
|
||||
if (result != RDC_ST_OK)
|
||||
break;
|
||||
if (result != RDC_ST_OK) break;
|
||||
|
||||
// set initial values to cache
|
||||
result = cache_mgr_->rdc_health_set(group_id, fields->first, value);
|
||||
if (result != RDC_ST_OK)
|
||||
break;
|
||||
if (result != RDC_ST_OK) break;
|
||||
}
|
||||
|
||||
// Start to watch the fields and update fields per 1 second.
|
||||
@@ -461,10 +494,8 @@ rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
|
||||
return result;
|
||||
}
|
||||
|
||||
rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id,
|
||||
unsigned int *components) {
|
||||
if (nullptr == components)
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) {
|
||||
if (nullptr == components) return RDC_ST_BAD_PARAMETER;
|
||||
|
||||
std::lock_guard<std::mutex> guard(watch_mutex_);
|
||||
auto table_iter = health_watch_table_.find(group_id);
|
||||
@@ -478,23 +509,19 @@ rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id,
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
|
||||
rdc_health_system_t component,
|
||||
rdc_health_result_t health,
|
||||
uint32_t err_code,
|
||||
std::string err_msg,
|
||||
rdc_health_incidents_t* incident,
|
||||
bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index, rdc_health_system_t component,
|
||||
rdc_health_result_t health, uint32_t err_code,
|
||||
std::string err_msg, rdc_health_incidents_t* incident,
|
||||
rdc_health_response_t* response) {
|
||||
bool result = false;
|
||||
|
||||
incident->gpu_index = gpu_index;
|
||||
incident->component = component;
|
||||
incident->health = health;
|
||||
incident->gpu_index = gpu_index;
|
||||
incident->component = component;
|
||||
incident->health = health;
|
||||
incident->error.code = err_code;
|
||||
strncpy_with_null(incident->error.msg, err_msg.c_str(), MAX_HEALTH_MSG_LENGTH);
|
||||
|
||||
if (incident->health > response->overall_health)
|
||||
response->overall_health = incident->health;
|
||||
if (incident->health > response->overall_health) response->overall_health = incident->health;
|
||||
response->incidents_count++;
|
||||
if (response->incidents_count >= HEALTH_MAX_ERROR_ITEMS) {
|
||||
RDC_LOG(RDC_INFO, "Health incidents are full!");
|
||||
@@ -504,24 +531,20 @@ bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
|
||||
return (result);
|
||||
}
|
||||
|
||||
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_field_t field,
|
||||
uint64_t start_timestamp,
|
||||
rdc_field_value *start_value,
|
||||
rdc_field_value *end_value) {
|
||||
if ((nullptr == start_value) && (nullptr == end_value))
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id, uint32_t gpu_index,
|
||||
rdc_field_t field, uint64_t start_timestamp,
|
||||
rdc_field_value* start_value,
|
||||
rdc_field_value* end_value) {
|
||||
if ((nullptr == start_value) && (nullptr == end_value)) return RDC_ST_BAD_PARAMETER;
|
||||
|
||||
rdc_status_t result = RDC_ST_OK;
|
||||
if (nullptr != start_value) {
|
||||
//get the values of the field at the start_timestamp/end_timestampe
|
||||
result = cache_mgr_->rdc_health_get_values(group_id,
|
||||
gpu_index, field,
|
||||
start_timestamp, 0,
|
||||
start_value, nullptr);
|
||||
// get the values of the field at the start_timestamp/end_timestampe
|
||||
result = cache_mgr_->rdc_health_get_values(group_id, gpu_index, field, start_timestamp, 0,
|
||||
start_value, nullptr);
|
||||
if (result != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << result);
|
||||
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field
|
||||
<< " history data. Return: " << result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -529,30 +552,25 @@ rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
|
||||
// get end values
|
||||
result = metric_fetcher_->fetch_smi_field(gpu_index, field, end_value);
|
||||
if (result != RDC_ST_OK)
|
||||
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " current data. Return: " << result);
|
||||
RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field
|
||||
<< " current data. Return: " << result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
|
||||
rdc_health_response_t* response) {
|
||||
//get field start/end values
|
||||
// get field start/end values
|
||||
rdc_field_value start = {}, end = {};
|
||||
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
|
||||
//get the history data last 1 minute
|
||||
rdc_status_t result = get_start_end_values(group_id,
|
||||
gpu_index,
|
||||
RDC_HEALTH_PCIE_REPLAY_COUNT,
|
||||
start_timestamp,
|
||||
&start,
|
||||
&end);
|
||||
if (result != RDC_ST_OK)
|
||||
return result;
|
||||
// get the history data last 1 minute
|
||||
rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PCIE_REPLAY_COUNT,
|
||||
start_timestamp, &start, &end);
|
||||
if (result != RDC_ST_OK) return result;
|
||||
|
||||
uint64_t pcie_replay_count = end.value.l_int - start.value.l_int;
|
||||
if (pcie_replay_count > PCIE_MAX_REPLAYS_PERMIN) {
|
||||
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
|
||||
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
|
||||
|
||||
std::string err_msg = "Detected ";
|
||||
err_msg += std::to_string(pcie_replay_count);
|
||||
@@ -560,37 +578,26 @@ rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
|
||||
err_msg += std::to_string(PCIE_MAX_REPLAYS_PERMIN);
|
||||
err_msg += ".";
|
||||
|
||||
//add incident
|
||||
if (add_health_incident(gpu_index,
|
||||
RDC_HEALTH_WATCH_PCIE,
|
||||
RDC_HEALTH_RESULT_WARN,
|
||||
RDC_FR_PCI_REPLAY_RATE,
|
||||
err_msg,
|
||||
incident,
|
||||
response))
|
||||
// add incident
|
||||
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_PCIE, RDC_HEALTH_RESULT_WARN,
|
||||
RDC_FR_PCI_REPLAY_RATE, err_msg, incident, response))
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
|
||||
rdc_health_response_t* response) {
|
||||
//get field start/end values
|
||||
// get field start/end values
|
||||
rdc_field_value end = {};
|
||||
rdc_status_t result = get_start_end_values(group_id,
|
||||
gpu_index,
|
||||
RDC_HEALTH_XGMI_ERROR,
|
||||
0,
|
||||
nullptr,
|
||||
&end);
|
||||
if (result != RDC_ST_OK)
|
||||
return result;
|
||||
rdc_status_t result =
|
||||
get_start_end_values(group_id, gpu_index, RDC_HEALTH_XGMI_ERROR, 0, nullptr, &end);
|
||||
if (result != RDC_ST_OK) return result;
|
||||
|
||||
amdsmi_xgmi_status_t status = static_cast<amdsmi_xgmi_status_t>(end.value.l_int);
|
||||
if (AMDSMI_XGMI_STATUS_NO_ERRORS != status) {
|
||||
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
|
||||
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
|
||||
|
||||
uint32_t err_code;
|
||||
std::string err_msg = "Detected ";
|
||||
@@ -603,106 +610,68 @@ rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
|
||||
}
|
||||
err_msg += ".";
|
||||
|
||||
//add incident
|
||||
if (add_health_incident(gpu_index,
|
||||
RDC_HEALTH_WATCH_XGMI,
|
||||
RDC_HEALTH_RESULT_FAIL,
|
||||
err_code,
|
||||
err_msg,
|
||||
incident,
|
||||
response))
|
||||
// add incident
|
||||
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_XGMI, RDC_HEALTH_RESULT_FAIL, err_code,
|
||||
err_msg, incident, response))
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
|
||||
rdc_health_response_t* response) {
|
||||
//get field start/end values
|
||||
rdc_field_value start= {}, end = {};
|
||||
rdc_status_t result = get_start_end_values(group_id,
|
||||
gpu_index,
|
||||
RDC_FI_ECC_UNCORRECT_TOTAL,
|
||||
0,
|
||||
nullptr,
|
||||
&end);
|
||||
if (result != RDC_ST_OK)
|
||||
return result;
|
||||
// get field start/end values
|
||||
rdc_field_value start = {}, end = {};
|
||||
rdc_status_t result =
|
||||
get_start_end_values(group_id, gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL, 0, nullptr, &end);
|
||||
if (result != RDC_ST_OK) return result;
|
||||
|
||||
uint64_t ecc_uncorrectable_count = 0;
|
||||
ecc_uncorrectable_count = end.value.l_int;
|
||||
if (ecc_uncorrectable_count > 0) {
|
||||
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
|
||||
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
|
||||
|
||||
std::string err_msg = "Detected ";
|
||||
err_msg += std::to_string(ecc_uncorrectable_count);
|
||||
err_msg += " uncorrectable ECC error(s) since last GPU reset.";
|
||||
|
||||
//add incident
|
||||
if (add_health_incident(gpu_index,
|
||||
RDC_HEALTH_WATCH_MEM,
|
||||
RDC_HEALTH_RESULT_FAIL,
|
||||
RDC_FR_ECC_UNCORRECTABLE_DETECTED,
|
||||
err_msg,
|
||||
incident,
|
||||
response))
|
||||
// add incident
|
||||
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL,
|
||||
RDC_FR_ECC_UNCORRECTABLE_DETECTED, err_msg, incident, response))
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
}
|
||||
|
||||
result = get_start_end_values(group_id,
|
||||
gpu_index,
|
||||
RDC_HEALTH_PENDING_PAGE_NUM,
|
||||
0,
|
||||
nullptr,
|
||||
&end);
|
||||
if (result != RDC_ST_OK)
|
||||
return result;
|
||||
result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_PENDING_PAGE_NUM, 0, nullptr, &end);
|
||||
if (result != RDC_ST_OK) return result;
|
||||
|
||||
uint64_t num_pages = end.value.l_int;
|
||||
if (num_pages > 0) {
|
||||
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
|
||||
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
|
||||
|
||||
std::string err_msg = "Detected ";
|
||||
err_msg += std::to_string(num_pages);
|
||||
err_msg += " pending retired page(s).";
|
||||
|
||||
//add incident
|
||||
if (add_health_incident(gpu_index,
|
||||
RDC_HEALTH_WATCH_MEM,
|
||||
RDC_HEALTH_RESULT_WARN,
|
||||
RDC_FR_PENDING_PAGE_RETIREMENTS,
|
||||
err_msg,
|
||||
incident,
|
||||
response))
|
||||
// add incident
|
||||
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_WARN,
|
||||
RDC_FR_PENDING_PAGE_RETIREMENTS, err_msg, incident, response))
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
}
|
||||
|
||||
//get retired page number
|
||||
result = get_start_end_values(group_id,
|
||||
gpu_index,
|
||||
RDC_HEALTH_RETIRED_PAGE_NUM,
|
||||
0,
|
||||
nullptr,
|
||||
&end);
|
||||
if (result != RDC_ST_OK)
|
||||
return result;
|
||||
// get retired page number
|
||||
result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_NUM, 0, nullptr, &end);
|
||||
if (result != RDC_ST_OK) return result;
|
||||
uint64_t retired_page = end.value.l_int;
|
||||
|
||||
//get retired page threshold
|
||||
result = get_start_end_values(group_id,
|
||||
gpu_index,
|
||||
RDC_HEALTH_RETIRED_PAGE_LIMIT,
|
||||
0,
|
||||
nullptr,
|
||||
&end);
|
||||
if (result != RDC_ST_OK)
|
||||
return result;
|
||||
// get retired page threshold
|
||||
result =
|
||||
get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_LIMIT, 0, nullptr, &end);
|
||||
if (result != RDC_ST_OK) return result;
|
||||
uint32_t retired_page_threshold = end.value.l_int;
|
||||
|
||||
if (retired_page > retired_page_threshold) {
|
||||
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
|
||||
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
|
||||
|
||||
std::string err_msg = "Detected ";
|
||||
err_msg += std::to_string(retired_page);
|
||||
@@ -710,14 +679,9 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
|
||||
err_msg += std::to_string(retired_page_threshold);
|
||||
err_msg += ".";
|
||||
|
||||
//add incident
|
||||
if (add_health_incident(gpu_index,
|
||||
RDC_HEALTH_WATCH_MEM,
|
||||
RDC_HEALTH_RESULT_FAIL,
|
||||
RDC_FR_RETIRED_PAGES_LIMIT,
|
||||
err_msg,
|
||||
incident,
|
||||
response))
|
||||
// add incident
|
||||
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL,
|
||||
RDC_FR_RETIRED_PAGES_LIMIT, err_msg, incident, response))
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
|
||||
return RDC_ST_OK;
|
||||
@@ -725,31 +689,22 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
|
||||
|
||||
if (retired_page > 0) {
|
||||
uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 604800) * 1000;
|
||||
//get retired page number last 1 week
|
||||
result = get_start_end_values(group_id,
|
||||
gpu_index,
|
||||
RDC_HEALTH_RETIRED_PAGE_NUM,
|
||||
start_timestamp,
|
||||
&start,
|
||||
&end);
|
||||
if (result != RDC_ST_OK)
|
||||
return result;
|
||||
// get retired page number last 1 week
|
||||
result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_RETIRED_PAGE_NUM, start_timestamp,
|
||||
&start, &end);
|
||||
if (result != RDC_ST_OK) return result;
|
||||
|
||||
retired_page = end.value.l_int - start.value.l_int;
|
||||
if (retired_page > 1) {
|
||||
rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
|
||||
rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
|
||||
|
||||
std::string err_msg = "Detected ";
|
||||
err_msg += std::to_string(retired_page);
|
||||
err_msg += " retired pages more than one in the last week.";
|
||||
|
||||
//add incident
|
||||
if (add_health_incident(gpu_index,
|
||||
RDC_HEALTH_WATCH_MEM,
|
||||
RDC_HEALTH_RESULT_FAIL,
|
||||
RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT,
|
||||
err_msg,
|
||||
incident,
|
||||
// add incident
|
||||
if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL,
|
||||
RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT, err_msg, incident,
|
||||
response))
|
||||
return RDC_ST_MAX_LIMIT;
|
||||
}
|
||||
@@ -758,194 +713,150 @@ rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
// EEPROM health check for a single GPU of a group.
//
// Queries RDC_FI_ECC_UNCORRECT_TOTAL through get_start_end_values(). Only the
// status of that query matters here: RDC_ST_CORRUPTED_EEPROM is turned into a
// WARN incident appended to `response`.
//
// Returns:
//   RDC_ST_OK         - query succeeded, or the incident was recorded.
//   RDC_ST_MAX_LIMIT  - add_health_incident() returned a non-zero result.
//   any other status  - propagated unchanged from get_start_end_values().
rdc_status_t RdcWatchTableImpl::eeprom_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
                                             rdc_health_response_t* response) {
  rdc_field_value latest = {};
  const rdc_status_t status =
      get_start_end_values(group_id, gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL, 0, nullptr, &latest);

  // Healthy EEPROM: nothing to report.
  if (status == RDC_ST_OK) return RDC_ST_OK;

  // Any failure other than a corrupted EEPROM is passed back to the caller.
  if (status != RDC_ST_CORRUPTED_EEPROM) return status;

  // The incident is written into the next free slot of the response.
  rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
  std::string err_msg = "Detected a corrupt EEPROM since last GPU reset.";

  if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_EEPROM, RDC_HEALTH_RESULT_WARN,
                          RDC_FR_CORRUPT_EEPROM, err_msg, incident, response)) {
    return RDC_ST_MAX_LIMIT;
  }

  return RDC_ST_OK;
}
|
||||
|
||||
// Thermal health check for a single GPU of a group.
//
// Compares the first and last RDC_HEALTH_THERMAL_THROTTLE_TIME samples of the
// last 60 seconds. A non-zero difference is reported as a WARN incident in
// `response`.
//
// Returns:
//   RDC_ST_OK         - no throttling seen, or the incident was recorded.
//   RDC_ST_MAX_LIMIT  - add_health_incident() returned a non-zero result.
//   any other status  - propagated unchanged from get_start_end_values().
rdc_status_t RdcWatchTableImpl::thermal_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
                                              rdc_health_response_t* response) {
  rdc_field_value first = {};
  rdc_field_value last = {};

  // Window start: "now - 60 s", converted from seconds to milliseconds.
  const uint64_t window_start_ms = static_cast<uint64_t>(time(nullptr) - 60) * 1000;

  const rdc_status_t status =
      get_start_end_values(group_id, gpu_index, RDC_HEALTH_THERMAL_THROTTLE_TIME, window_start_ms,
                           &first, &last);
  if (status != RDC_ST_OK) return status;

  // Throttle counter growth inside the window.
  const uint64_t throttle_delta = last.value.l_int - first.value.l_int;
  if (throttle_delta == 0) return RDC_ST_OK;

  rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];
  std::string err_msg = "Detected " + std::to_string(throttle_delta) +
                        " clock throttling due to thermal violation in the last minute.";

  if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_THERMAL, RDC_HEALTH_RESULT_WARN,
                          RDC_FR_CLOCKS_THROTTLE_THERMAL, err_msg, incident, response)) {
    return RDC_ST_MAX_LIMIT;
  }

  return RDC_ST_OK;
}
|
||||
|
||||
// Power health check for a single GPU of a group.
//
// Compares the first and last RDC_HEALTH_POWER_THROTTLE_TIME samples of the
// last 60 seconds. A non-zero difference is reported as a WARN incident in
// `response`.
//
// Returns:
//   RDC_ST_OK         - no throttling seen, or the incident was recorded.
//   RDC_ST_MAX_LIMIT  - add_health_incident() returned a non-zero result.
//   any other status  - propagated unchanged from get_start_end_values().
rdc_status_t RdcWatchTableImpl::power_check(rdc_gpu_group_t group_id, uint32_t gpu_index,
                                            rdc_health_response_t* response) {
  // get field start/end values
  rdc_field_value start = {}, end = {};

  // Window start: "now - 60 s", converted from seconds to milliseconds.
  uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;

  // get the history data last 1 minute
  rdc_status_t result = get_start_end_values(group_id, gpu_index, RDC_HEALTH_POWER_THROTTLE_TIME,
                                             start_timestamp, &start, &end);
  if (result != RDC_ST_OK) return result;

  uint64_t acc_ppt_pwr = end.value.l_int - start.value.l_int;
  if (0 < acc_ppt_pwr) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    // The message previously repeated the word "Detected" after the number
    // ("Detected 3 Detected clock throttling ..."); it now has the same shape
    // as the thermal incident message.
    std::string err_msg = "Detected ";
    err_msg += std::to_string(acc_ppt_pwr);
    err_msg += " clock throttling due to power violation in the last minute.";

    // add incident
    if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_POWER, RDC_HEALTH_RESULT_WARN,
                            RDC_FR_CLOCKS_THROTTLE_POWER, err_msg, incident, response))
      return RDC_ST_MAX_LIMIT;
  }

  return RDC_ST_OK;
}
|
||||
// Runs every enabled health check for all entities of a GPU group.
//
// Steps:
//   1. Copy the watched components mask and field list for `group_id` while
//      holding watch_mutex_.
//   2. Fetch the group info.
//   3. Refresh the health cache with the current value of each watched field.
//      The refresh stops at the first failure, but the checks below still run.
//   4. Reset `response` and run the PCIe, XGMI, memory, EEPROM, thermal and
//      power checks (in that order) for every entity of the group.
//
// Returns:
//   RDC_ST_BAD_PARAMETER - `response` is null.
//   RDC_ST_NOT_FOUND     - the group has no entry in health_watch_table_.
//   RDC_ST_MAX_LIMIT     - a check reported RDC_ST_MAX_LIMIT; checking stops.
//   RDC_ST_OK            - otherwise (other per-check statuses are ignored).
//   The status of rdc_group_gpu_get_info() is propagated when it fails.
rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
                                                 rdc_health_response_t* response) {
  if (response == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  unsigned int components = 0;
  std::vector<RdcFieldKey> fields_in_watch;
  {
    // Hold the lock only while copying the watch entry.
    std::lock_guard<std::mutex> guard(watch_mutex_);
    auto entry = health_watch_table_.find(group_id);
    if (entry == health_watch_table_.end()) {
      return RDC_ST_NOT_FOUND;
    }
    components = entry->second.components;
    fields_in_watch = entry->second.fields;
  }

  rdc_group_info_t ginfo;
  rdc_status_t result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
  if (result != RDC_ST_OK) {
    return result;
  }

  for (auto& key : fields_in_watch) {
    // get current values
    rdc_field_value value;
    if (metric_fetcher_->fetch_smi_field(key.first, key.second, &value) != RDC_ST_OK) {
      break;
    }

    // set current values to cache
    if (cache_mgr_->rdc_update_health_stats(group_id, key.first, value) != RDC_ST_OK) {
      break;
    }
  }

  // init response
  response->overall_health = RDC_HEALTH_RESULT_PASS;
  response->incidents_count = 0;

  for (uint32_t gindex = 0; gindex < ginfo.count; gindex++) {
    // Each enabled check runs in turn; only RDC_ST_MAX_LIMIT aborts the scan.

    // PCIe
    if ((components & RDC_HEALTH_WATCH_PCIE) &&
        pcie_check(group_id, ginfo.entity_ids[gindex], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // XGMI
    if ((components & RDC_HEALTH_WATCH_XGMI) &&
        xgmi_check(group_id, ginfo.entity_ids[gindex], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // Memory
    if ((components & RDC_HEALTH_WATCH_MEM) &&
        memory_check(group_id, ginfo.entity_ids[gindex], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // EEPROM
    if ((components & RDC_HEALTH_WATCH_EEPROM) &&
        eeprom_check(group_id, ginfo.entity_ids[gindex], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // Thermal
    if ((components & RDC_HEALTH_WATCH_THERMAL) &&
        thermal_check(group_id, ginfo.entity_ids[gindex], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // Power
    if ((components & RDC_HEALTH_WATCH_POWER) &&
        power_check(group_id, ginfo.entity_ids[gindex], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }
  }  // end of for gindex

  return RDC_ST_OK;
}
|
||||
@@ -953,7 +864,7 @@ rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
|
||||
rdc_status_t RdcWatchTableImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
|
||||
rdc_field_grp_t field_group_id;
|
||||
|
||||
do { //< lock guard for thread safe
|
||||
do { //< lock guard for thread safe
|
||||
std::lock_guard<std::mutex> guard(watch_mutex_);
|
||||
auto health = health_watch_table_.find(group_id);
|
||||
if (health == health_watch_table_.end()) {
|
||||
@@ -1219,8 +1130,8 @@ void RdcWatchTableImpl::debug_status() {
|
||||
for (const auto& p : hite->second.fields) {
|
||||
strstream << "<" << p.first << "," << p.second << "> ";
|
||||
}
|
||||
RDC_LOG(RDC_DEBUG,
|
||||
"group id : " << hite->first << " components : " << hite->second.components << " fields : " << strstream.str());
|
||||
RDC_LOG(RDC_DEBUG, "group id : " << hite->first << " components : " << hite->second.components
|
||||
<< " fields : " << strstream.str());
|
||||
}
|
||||
|
||||
if (fields_to_watch_.size() > 0) {
|
||||
|
||||
@@ -23,6 +23,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
@@ -79,44 +80,59 @@ rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) {
|
||||
|
||||
// Resolves an RDC entity index to an AMD SMI processor handle.
//
// `gpu_id` is decoded with rdc_get_info_from_entity_index(); its device_index
// selects a socket and its instance_index selects a processor within that
// socket.
//
// @param[in]  gpu_id            Entity index to resolve.
// @param[out] processor_handle  Receives the handle on success.
//
// Returns:
//   AMDSMI_STATUS_SUCCESS             - `*processor_handle` was set.
//   AMDSMI_STATUS_INVAL               - `processor_handle` is null.
//   AMDSMI_STATUS_NOT_SUPPORTED       - a processor that is not an AMD GPU was found.
//   AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS - socket or instance index out of range.
//   Any failure status from the AMD SMI enumeration calls is propagated.
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
                                             amdsmi_processor_handle* processor_handle) {
  if (processor_handle == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // First call with a null buffer only reports how many sockets exist.
  uint32_t socket_count = 0;
  amdsmi_status_t ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  std::vector<amdsmi_socket_handle> sockets(socket_count);
  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }
  // Drop slots the second call did not fill, so both containers below are
  // guaranteed to have the same length.
  if (socket_count < sockets.size()) {
    sockets.resize(socket_count);
  }

  std::vector<std::vector<amdsmi_processor_handle>> procs_by_socket(sockets.size());

  for (size_t s = 0; s < sockets.size(); s++) {
    uint32_t proc_count = 0;
    ret = amdsmi_get_processor_handles(sockets[s], &proc_count, nullptr);
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }

    // Fill the per-socket list in place; no temporary copy is needed.
    std::vector<amdsmi_processor_handle>& procs = procs_by_socket[s];
    procs.resize(proc_count);
    ret = amdsmi_get_processor_handles(sockets[s], &proc_count, procs.data());
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }
    if (proc_count < procs.size()) {
      procs.resize(proc_count);
    }

    for (auto& proc : procs) {
      processor_type_t proc_type = {};
      ret = amdsmi_get_processor_type(proc, &proc_type);
      // Report the real failure instead of misreporting it as "not supported".
      if (ret != AMDSMI_STATUS_SUCCESS) {
        return ret;
      }
      if (proc_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
      }
    }
  }

  const rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_id);
  const uint32_t socket_index = info.device_index;
  const uint32_t instance_index = info.instance_index;

  if (socket_index >= procs_by_socket.size()) {
    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
  }

  // Get processor handle from GPU id
  const auto& handles = procs_by_socket[socket_index];
  if (instance_index >= handles.size()) {
    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
  }

  *processor_handle = handles[instance_index];
  return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
@@ -141,5 +157,69 @@ amdsmi_status_t get_processor_count(uint32_t& all_processor_count) {
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Fills `sockets` with the socket handles reported by AMD SMI.
//
// Uses the usual two-call pattern: the first call (null buffer) reports the
// number of sockets, the second call writes the handles.
//
// @param[out] sockets  Receives the handles. Left untouched if the count query
//                      fails; emptied if the handle query fails.
// @return The status of the last AMD SMI call made.
amdsmi_status_t get_socket_handles(std::vector<amdsmi_socket_handle>& sockets) {
  uint32_t socket_count = 0;
  amdsmi_status_t ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  sockets.resize(socket_count);

  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Do not hand placeholder entries back to the caller.
    sockets.clear();
    return ret;
  }

  // Keep only the entries that were actually written.
  if (socket_count < sockets.size()) {
    sockets.resize(socket_count);
  }

  return ret;
}
|
||||
|
||||
// Fills `processors` with the processor handles of one socket.
//
// Uses the usual two-call pattern: the first call (null buffer) reports the
// number of processors, the second call writes the handles.
//
// @param[in]  socket      Socket whose processors are enumerated.
// @param[out] processors  Receives the handles. Left untouched if the count
//                         query fails; emptied if the handle query fails.
// @return The status of the last AMD SMI call made.
amdsmi_status_t get_processor_handles(amdsmi_socket_handle socket,
                                      std::vector<amdsmi_processor_handle>& processors) {
  uint32_t processor_count = 0;
  amdsmi_status_t ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  processors.resize(processor_count);

  ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Do not hand placeholder entries back to the caller.
    processors.clear();
    return ret;
  }

  // Keep only the entries that were actually written.
  if (processor_count < processors.size()) {
    processors.resize(processor_count);
  }

  return ret;
}
|
||||
|
||||
// Reads the current KFD partition id of a processor.
//
// @param[in]  proc          Processor handle to query.
// @param[out] partition_id  Receives kfd_info.current_partition_id on success.
//
// Returns AMDSMI_STATUS_INVAL when `partition_id` is null, otherwise the
// status of amdsmi_get_gpu_kfd_info(). `*partition_id` is written only on
// success.
amdsmi_status_t get_kfd_partition_id(amdsmi_processor_handle proc, uint32_t* partition_id) {
  if (partition_id == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  amdsmi_kfd_info_t kfd_info = {};
  const amdsmi_status_t ret = amdsmi_get_gpu_kfd_info(proc, &kfd_info);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  *partition_id = kfd_info.current_partition_id;
  return ret;
}
|
||||
|
||||
// Thin wrapper: forwards to amdsmi_get_gpu_metrics_info() and returns its
// status unchanged. `metrics` is passed through as the output buffer.
amdsmi_status_t get_metrics_info(amdsmi_processor_handle proc, amdsmi_gpu_metrics_t* metrics) {
  return amdsmi_get_gpu_metrics_info(proc, metrics);
}
|
||||
|
||||
// Reports the partition count of a device as found in its GPU metrics.
//
// @param[in]  index          Entity index, resolved with
//                            get_processor_handle_from_id().
// @param[out] num_partition  Receives metrics.num_partition on success.
//
// Returns AMDSMI_STATUS_INVAL when `num_partition` is null; otherwise the
// status of the handle lookup or of the metrics query. `*num_partition` is
// written only on success.
amdsmi_status_t get_num_partition(uint32_t index, uint16_t* num_partition) {
  if (num_partition == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // Get the processor handle for the physical device.
  amdsmi_processor_handle proc_handle = nullptr;
  amdsmi_status_t ret = get_processor_handle_from_id(index, &proc_handle);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  // Zero the struct so no stale data is read if the driver fills it partially.
  amdsmi_gpu_metrics_t metrics = {};
  ret = get_metrics_info(proc_handle, &metrics);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  *num_partition = metrics.num_partition;
  return ret;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -1075,5 +1075,41 @@ rdc_status_t RdcStandaloneHandler::rdc_link_status_get(rdc_link_status_t* result
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
// Asks the RDC server for the partition count of a GPU (GetNumPartition RPC).
//
// @param[in]  index          GPU index sent in the request.
// @param[out] num_partition  Receives the count from the reply on success.
//
// Returns RDC_ST_BAD_PARAMETER when `num_partition` is null (no RPC is made),
// the status produced by error_handle() when the call or the reply reports a
// failure, and RDC_ST_OK otherwise.
rdc_status_t RdcStandaloneHandler::rdc_get_num_partition(uint32_t index, uint16_t* num_partition) {
  if (num_partition == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::GetNumPartitionRequest request;
  request.set_gpu_index(index);
  ::rdc::GetNumPartitionResponse reply;
  ::grpc::ClientContext context;

  ::grpc::Status status = stub_->GetNumPartition(&context, request, &reply);
  rdc_status_t err_status = error_handle(status, reply.status());
  if (err_status != RDC_ST_OK) {
    return err_status;
  }

  // The public API exposes a 16-bit count; make the conversion explicit.
  *num_partition = static_cast<uint16_t>(reply.num_partition());
  return RDC_ST_OK;
}
|
||||
|
||||
// Asks the RDC server for the resource profile of an entity
// (GetInstanceProfile RPC).
//
// @param[in]  entity_index   Entity index sent in the request.
// @param[in]  resource_type  Resource to query; sent as uint32_t.
// @param[out] profile        Receives partition_resource and
//                            num_partitions_share_resource on success.
//
// Returns RDC_ST_BAD_PARAMETER when `profile` is null (no RPC is made), the
// status produced by error_handle() when the call or the reply reports a
// failure, and RDC_ST_OK otherwise.
rdc_status_t RdcStandaloneHandler::rdc_instance_profile_get(
    uint32_t entity_index, rdc_instance_resource_type_t resource_type,
    rdc_resource_profile_t* profile) {
  if (profile == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::GetInstanceProfileRequest request;
  request.set_entity_index(entity_index);
  request.set_resource_type(static_cast<uint32_t>(resource_type));

  ::rdc::GetInstanceProfileResponse reply;
  ::grpc::ClientContext context;

  ::grpc::Status status = stub_->GetInstanceProfile(&context, request, &reply);
  rdc_status_t err_status = error_handle(status, reply.status());
  if (err_status != RDC_ST_OK) {
    return err_status;
  }

  profile->partition_resource = reply.partition_resource();
  profile->num_partitions_share_resource = reply.num_partitions_share_resource();
  return RDC_ST_OK;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -56,7 +56,8 @@ set(INC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
|
||||
|
||||
include_directories(${INC_DIR} ${PROJECT_SOURCE_DIR}/include
|
||||
"${GRPC_ROOT}/include"
|
||||
${PROJECT_SOURCE_DIR})
|
||||
${PROJECT_SOURCE_DIR}
|
||||
${AMD_SMI_INCLUDE_DIR})
|
||||
|
||||
set(RDCI_SRC_LIST
|
||||
"${COMMON_DIR}/rdc_fields_supported.cc"
|
||||
|
||||
@@ -37,7 +37,9 @@ class RdciDiscoverySubSystem : public RdciSubSystem {
|
||||
bool show_help_;
|
||||
void show_help() const;
|
||||
bool is_list_;
|
||||
bool is_partition_;
|
||||
void show_attributes();
|
||||
void show_attributes_with_partitions();
|
||||
bool show_version_;
|
||||
void show_version();
|
||||
};
|
||||
|
||||
@@ -43,6 +43,9 @@ class RdciDmonSubSystem : public RdciSubSystem {
|
||||
void show_field_usage() const;
|
||||
void clean_up();
|
||||
|
||||
// Need to resolve gpu indexes after process is called
|
||||
void resolve_gpu_indexes();
|
||||
|
||||
void create_temp_group();
|
||||
void create_temp_field_group();
|
||||
|
||||
@@ -64,6 +67,7 @@ class RdciDmonSubSystem : public RdciSubSystem {
|
||||
|
||||
std::map<OPTIONS, uint32_t> options_;
|
||||
std::vector<rdc_field_t> field_ids_;
|
||||
std::string raw_gpu_indexes_;
|
||||
std::vector<uint32_t> gpu_indexes_;
|
||||
bool need_cleanup_;
|
||||
uint64_t latest_time_stamp_;
|
||||
|
||||
@@ -24,6 +24,10 @@ THE SOFTWARE.
|
||||
#include <getopt.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <set>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc/rdc_private.h"
|
||||
#include "rdc_lib/RdcException.h"
|
||||
@@ -33,22 +37,23 @@ namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Starts with every operation flag cleared; parse_cmd_opts() turns on the
// ones selected on the command line.
RdciDiscoverySubSystem::RdciDiscoverySubSystem()
    : show_help_(false),
      is_list_(false),
      is_partition_(false),
      show_version_(false) {}
|
||||
|
||||
void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
const int HOST_OPTIONS = 1000;
|
||||
const int JSON_OPTIONS = 1001;
|
||||
const struct option long_options[] = {
|
||||
{"host", required_argument, nullptr, HOST_OPTIONS}, {"help", optional_argument, nullptr, 'h'},
|
||||
{"unauth", optional_argument, nullptr, 'u'}, {"list", optional_argument, nullptr, 'l'},
|
||||
{"json", optional_argument, nullptr, JSON_OPTIONS}, {"version", optional_argument, nullptr, 'v'}, {nullptr, 0, nullptr, 0}};
|
||||
const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
|
||||
{"help", optional_argument, nullptr, 'h'},
|
||||
{"unauth", optional_argument, nullptr, 'u'},
|
||||
{"list", optional_argument, nullptr, 'l'},
|
||||
{"json", optional_argument, nullptr, JSON_OPTIONS},
|
||||
{"version", optional_argument, nullptr, 'v'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
int option_index = 0;
|
||||
int opt = 0;
|
||||
|
||||
while ((opt = getopt_long(argc, argv, "hluv", long_options, &option_index)) != -1) {
|
||||
while ((opt = getopt_long(argc, argv, "hliuv", long_options, &option_index)) != -1) {
|
||||
switch (opt) {
|
||||
case HOST_OPTIONS:
|
||||
ip_port_ = optarg;
|
||||
@@ -65,6 +70,9 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
case 'l':
|
||||
is_list_ = true;
|
||||
break;
|
||||
case 'i':
|
||||
is_partition_ = true;
|
||||
break;
|
||||
case 'v':
|
||||
show_version_ = true;
|
||||
break;
|
||||
@@ -74,9 +82,10 @@ void RdciDiscoverySubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if ((!is_list_ && !show_version_) || (is_list_ && show_version_)) {
|
||||
int opCount = (is_list_ ? 1 : 0) + (is_partition_ ? 1 : 0) + (show_version_ ? 1 : 0);
|
||||
if (opCount != 1) {
|
||||
show_help();
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify operations");
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify exactly one operation");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,6 +102,8 @@ void RdciDiscoverySubSystem::show_help() const {
|
||||
<< "Output using json.\n";
|
||||
std::cout << " -l --list list GPU discovered"
|
||||
<< " on the system\n";
|
||||
std::cout << " -i --gpu-instance list GPU discovered"
|
||||
<< " on the system with partitions\n";
|
||||
std::cout << " -v --version Display version information of the"
|
||||
<< " the server and libraries used by the server\n";
|
||||
}
|
||||
@@ -108,7 +119,7 @@ void RdciDiscoverySubSystem::show_attributes() {
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"gpus\" : [], \"status\": \"ok\"";
|
||||
} else {
|
||||
std::cout << "No GPUs find on the system\n";
|
||||
std::cout << "No GPUs found on the system\n";
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -145,6 +156,145 @@ void RdciDiscoverySubSystem::show_attributes() {
|
||||
}
|
||||
}
|
||||
|
||||
void RdciDiscoverySubSystem::show_attributes_with_partitions() {
|
||||
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
|
||||
uint32_t count = 0;
|
||||
rdc_status_t result = rdc_device_get_all(rdc_handle_, gpu_index_list, &count);
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result, "Fail to get all devices");
|
||||
}
|
||||
|
||||
if (count == 0) {
|
||||
if (is_json_output())
|
||||
std::cout << "\"gpus\" : [], \"status\": \"ok\"";
|
||||
else
|
||||
std::cout << "No GPUs found on the system\n";
|
||||
return;
|
||||
}
|
||||
|
||||
// Print header.
|
||||
if (!is_json_output()) {
|
||||
std::cout << count << " GPUs found." << std::endl;
|
||||
std::cout << "---------------------------------------------------------------------"
|
||||
<< std::endl;
|
||||
std::cout << std::setw(12) << std::left << "GPU Index" << std::setw(20) << "Instance Index"
|
||||
<< std::setw(25) << "Device Information" << std::setw(8) << "XCC" << std::setw(8)
|
||||
<< "DECODER" << std::endl;
|
||||
} else {
|
||||
std::cout << "\"gpus\" : [";
|
||||
}
|
||||
|
||||
// Loop over each GPU.
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
rdc_device_attributes_t attribute;
|
||||
result = rdc_device_get_attributes(rdc_handle_, gpu_index_list[i], &attribute);
|
||||
if (result != RDC_ST_OK) return;
|
||||
|
||||
// Build physical device entity info.
|
||||
rdc_entity_info_t phys_info;
|
||||
phys_info.device_index = i;
|
||||
phys_info.instance_index = 0;
|
||||
phys_info.entity_role = RDC_DEVICE_ROLE_PHYSICAL;
|
||||
phys_info.device_type = RDC_DEVICE_TYPE_GPU;
|
||||
uint32_t phys_entity_index = rdc_get_entity_index_from_info(phys_info);
|
||||
|
||||
rdc_resource_profile_t phys_xcc = {};
|
||||
rdc_resource_profile_t phys_decoder_profile = {};
|
||||
result =
|
||||
rdc_instance_profile_get(rdc_handle_, phys_entity_index, RDC_ACCELERATOR_XCC, &phys_xcc);
|
||||
result = rdc_instance_profile_get(rdc_handle_, phys_entity_index, RDC_ACCELERATOR_DECODER,
|
||||
&phys_decoder_profile);
|
||||
|
||||
std::string phys_xcc_str = std::to_string(phys_xcc.partition_resource);
|
||||
std::string phys_decoder_str = std::to_string(phys_decoder_profile.partition_resource);
|
||||
|
||||
if (!is_json_output()) {
|
||||
std::cout << std::setw(12) << std::left << i << std::setw(20) << "" << std::setw(25)
|
||||
<< attribute.device_name << std::setw(8) << phys_xcc_str << std::setw(8)
|
||||
<< phys_decoder_str << std::endl;
|
||||
} else {
|
||||
std::cout << "{\"gpu_index\": \"" << i << "\", "
|
||||
<< "\"device_name\": \"" << attribute.device_name << "\", "
|
||||
<< "\"physical\": {"
|
||||
<< "\"xcc\": \"" << phys_xcc_str << "\", "
|
||||
<< "\"decoder\": \"" << phys_decoder_str << "\" "
|
||||
<< "}";
|
||||
}
|
||||
|
||||
uint16_t num_partition = 0;
|
||||
rdc_status_t result = rdc_get_num_partition(rdc_handle_, i, &num_partition);
|
||||
if (result != RDC_ST_OK) {
|
||||
return;
|
||||
}
|
||||
|
||||
// A partitionable device not in partitionable mode will have metrics.num_partition=1
|
||||
// Where as, a non-partitionable device will have metrics.num_partition = UINT16_MAX
|
||||
if (num_partition != UINT16_MAX && num_partition > 1) {
|
||||
if (is_json_output()) {
|
||||
std::cout << ", \"partitions\": [";
|
||||
}
|
||||
for (uint32_t pid = 0; pid < num_partition; pid++) {
|
||||
std::string instance_str = "g" + std::to_string(i) + "." + std::to_string(pid);
|
||||
|
||||
rdc_entity_info_t part_info;
|
||||
part_info.device_index = i;
|
||||
part_info.instance_index = pid;
|
||||
part_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE;
|
||||
part_info.device_type = RDC_DEVICE_TYPE_GPU;
|
||||
uint32_t part_entity_index = rdc_get_entity_index_from_info(part_info);
|
||||
|
||||
rdc_resource_profile_t part_xcc = {};
|
||||
rdc_resource_profile_t part_decoder = {};
|
||||
result = rdc_instance_profile_get(rdc_handle_, part_entity_index, RDC_ACCELERATOR_XCC,
|
||||
&part_xcc);
|
||||
result = rdc_instance_profile_get(rdc_handle_, part_entity_index, RDC_ACCELERATOR_DECODER,
|
||||
&part_decoder);
|
||||
|
||||
std::string part_decoder_str = std::to_string(part_decoder.partition_resource);
|
||||
std::string part_xcc_str = std::to_string(part_xcc.partition_resource);
|
||||
std::string starColumn = " ";
|
||||
if (part_decoder.num_partitions_share_resource > 1) {
|
||||
starColumn = "*";
|
||||
}
|
||||
|
||||
if (!is_json_output()) {
|
||||
std::cout << std::setw(12) << "" << std::setw(20) << instance_str << std::setw(25) << ""
|
||||
<< std::setw(7) << part_xcc_str << std::setw(1) << starColumn << std::setw(8)
|
||||
<< part_decoder_str << std::endl;
|
||||
} else {
|
||||
std::string decoder_shared =
|
||||
(part_decoder.num_partitions_share_resource > 1) ? "true" : "false";
|
||||
std::cout << "{\"instance_index\": \"" << instance_str << "\", "
|
||||
<< "\"xcc\": \"" << part_xcc_str << "\", "
|
||||
<< "\"decoder\": \"" << part_decoder_str << "\", "
|
||||
<< "\"decoder_shared\": " << decoder_shared << "}";
|
||||
|
||||
if (pid != num_partition - 1) {
|
||||
std::cout << ",";
|
||||
} else {
|
||||
std::cout << "]";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_json_output()) {
|
||||
if (i != count - 1)
|
||||
std::cout << "},";
|
||||
else
|
||||
std::cout << "}";
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_json_output()) {
|
||||
std::cout << "---------------------------------------------------------------------"
|
||||
<< std::endl;
|
||||
std::cout << "* if the resource is shared" << std::endl;
|
||||
} else {
|
||||
std::cout << ']';
|
||||
}
|
||||
}
|
||||
|
||||
void RdciDiscoverySubSystem::show_version() {
|
||||
rdc_component_version_t smiv;
|
||||
rdc_status_t result = rdc_device_get_component_version(rdc_handle_, RDC_AMDMSI_COMPONENT, &smiv);
|
||||
@@ -155,18 +305,21 @@ void RdciDiscoverySubSystem::show_version() {
|
||||
mixed_component_version_t rdcdv;
|
||||
uint32_t ret = get_mixed_component_version(rdc_handle_, RDCD_COMPONENT, &rdcdv);
|
||||
if (ret) {
|
||||
std::cout << "get rdcd version fail"<< std::endl;
|
||||
std::cout << "get rdcd version fail" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_json_output()) {
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"version\" : ";
|
||||
std::cout << '{';
|
||||
std::cout << "\"rdcd\": " << "\"" << rdcdv.version << "\", ";
|
||||
std::cout << "\"amdsmi_lib\": " << "\"" << smiv.version << "\"";
|
||||
std::cout << "\"rdcd\": "
|
||||
<< "\"" << rdcdv.version << "\", ";
|
||||
std::cout << "\"amdsmi_lib\": "
|
||||
<< "\"" << smiv.version << "\"";
|
||||
std::cout << '}';
|
||||
} else {
|
||||
std::cout << "RDCD : " << rdcdv.version << " | " << "AMDSMI Library : " << smiv.version << std::endl;
|
||||
std::cout << "RDCD : " << rdcdv.version << " | "
|
||||
<< "AMDSMI Library : " << smiv.version << std::endl;
|
||||
}
|
||||
|
||||
return;
|
||||
@@ -181,6 +334,10 @@ void RdciDiscoverySubSystem::process() {
|
||||
return show_attributes();
|
||||
}
|
||||
|
||||
if (is_partition_) {
|
||||
return show_attributes_with_partitions();
|
||||
}
|
||||
|
||||
if (show_version_) {
|
||||
return show_version();
|
||||
}
|
||||
|
||||
@@ -26,6 +26,8 @@ THE SOFTWARE.
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <iomanip>
|
||||
@@ -62,6 +64,15 @@ void RdciDmonSubSystem::set_terminating(int sig) {
|
||||
}
|
||||
}
|
||||
|
||||
std::string entity_to_string(uint32_t entity_index) {
|
||||
rdc_entity_info_t info = rdc_get_info_from_entity_index(entity_index);
|
||||
|
||||
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
|
||||
return "g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index);
|
||||
}
|
||||
return std::to_string(info.device_index);
|
||||
}
|
||||
|
||||
void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
const int HOST_OPTIONS = 1000;
|
||||
const int LIST_ALL_FIELDS_OPT = 1001;
|
||||
@@ -174,15 +185,6 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
if (gpu_indexes == "") {
|
||||
show_help();
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPUs or group id");
|
||||
} else {
|
||||
std::vector<std::string> vec_ids = split_string(gpu_indexes, ',');
|
||||
for (uint32_t i = 0; i < vec_ids.size(); i++) {
|
||||
if (!IsNumber(vec_ids[i])) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER,
|
||||
"The GPU index " + vec_ids[i] + " needs to be a number");
|
||||
}
|
||||
gpu_indexes_.push_back(std::stoi(vec_ids[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -207,6 +209,9 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
if (options_.find(OPTIONS_COUNT) == options_.end()) {
|
||||
options_.insert({OPTIONS_COUNT, std::numeric_limits<uint32_t>::max()});
|
||||
}
|
||||
|
||||
// Store gpu indexes to parse later
|
||||
raw_gpu_indexes_ = gpu_indexes;
|
||||
}
|
||||
|
||||
void RdciDmonSubSystem::show_help() const {
|
||||
@@ -272,8 +277,15 @@ void RdciDmonSubSystem::create_temp_group() {
|
||||
for (uint32_t i = 0; i < gpu_indexes_.size(); i++) {
|
||||
result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_indexes_[i]);
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result,
|
||||
"Fail to add " + std::to_string(gpu_indexes_[i]) + " to the dmon group.");
|
||||
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_indexes_[i]);
|
||||
std::string info_str;
|
||||
if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) {
|
||||
info_str =
|
||||
"g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index);
|
||||
} else {
|
||||
info_str = std::to_string(info.device_index);
|
||||
}
|
||||
throw RdcException(result, "Fail to add " + info_str + " to the dmon group.");
|
||||
}
|
||||
}
|
||||
options_.insert({OPTIONS_GROUP_ID, group_id});
|
||||
@@ -301,6 +313,73 @@ void RdciDmonSubSystem::create_temp_field_group() {
|
||||
options_.insert({OPTIONS_FIELD_GROUP_ID, group_id});
|
||||
}
|
||||
|
||||
void RdciDmonSubSystem::resolve_gpu_indexes() {
|
||||
uint32_t device_list[RDC_MAX_NUM_DEVICES];
|
||||
uint32_t count = 0;
|
||||
rdc_status_t res = rdc_device_get_all(rdc_handle_, device_list, &count);
|
||||
if (res != RDC_ST_OK) {
|
||||
throw RdcException(res, "Failed to get all devices");
|
||||
}
|
||||
|
||||
std::vector<std::string> vec_ids = split_string(raw_gpu_indexes_, ',');
|
||||
for (uint32_t i = 0; i < vec_ids.size(); i++) {
|
||||
if (rdc_is_partition_string(vec_ids[i].c_str())) {
|
||||
uint32_t logicalPhysicalGpu;
|
||||
uint32_t partition;
|
||||
if (!rdc_parse_partition_string(vec_ids[i].c_str(), &logicalPhysicalGpu, &partition)) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid partition format: " + vec_ids[i]);
|
||||
}
|
||||
|
||||
if (logicalPhysicalGpu >= count) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER,
|
||||
"GPU " + std::to_string(logicalPhysicalGpu) + " is out of range");
|
||||
}
|
||||
|
||||
uint32_t physicalGpu = device_list[logicalPhysicalGpu];
|
||||
|
||||
uint16_t num_partitions = 0;
|
||||
rdc_status_t st = rdc_get_num_partition(rdc_handle_, physicalGpu, &num_partitions);
|
||||
if (st != RDC_ST_OK) {
|
||||
throw RdcException(st,
|
||||
"Failed to get partition info for GPU " + std::to_string(physicalGpu));
|
||||
}
|
||||
|
||||
if (num_partitions == UINT16_MAX || num_partitions <= 1) {
|
||||
if (partition != 0) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "GPU " + std::to_string(physicalGpu) +
|
||||
" is not partitioned, so partition " +
|
||||
std::to_string(partition) + " is invalid");
|
||||
}
|
||||
} else {
|
||||
if (partition >= num_partitions) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER,
|
||||
"GPU " + std::to_string(physicalGpu) + " supports only " +
|
||||
std::to_string(num_partitions) + " partitions, partition " +
|
||||
std::to_string(partition) + " is invalid");
|
||||
}
|
||||
}
|
||||
|
||||
rdc_entity_info_t phys_info;
|
||||
phys_info.device_index = physicalGpu;
|
||||
phys_info.instance_index = partition;
|
||||
phys_info.entity_role = RDC_DEVICE_ROLE_PARTITION_INSTANCE;
|
||||
phys_info.device_type = RDC_DEVICE_TYPE_GPU;
|
||||
uint32_t phys_entity_index = rdc_get_entity_index_from_info(phys_info);
|
||||
gpu_indexes_.push_back(phys_entity_index);
|
||||
} else if (IsNumber(vec_ids[i])) {
|
||||
uint32_t logicalIndex = std::stoi(vec_ids[i]);
|
||||
if (logicalIndex >= count) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER,
|
||||
"GPU " + std::to_string(logicalIndex) + " is out of range");
|
||||
}
|
||||
gpu_indexes_.push_back(std::stoi(vec_ids[i]));
|
||||
} else {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "The GPU index " + vec_ids[i] +
|
||||
" needs to be a number or a valid partition");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RdciDmonSubSystem::show_field_usage() const {
|
||||
std::cout << "Supported fields Ids:" << std::endl;
|
||||
|
||||
@@ -430,6 +509,8 @@ void RdciDmonSubSystem::process() {
|
||||
rdc_group_info_t group_info;
|
||||
rdc_field_group_info_t field_info;
|
||||
|
||||
resolve_gpu_indexes();
|
||||
|
||||
// Create a temporary group/field if pass as GPU indexes or field ids
|
||||
create_temp_group();
|
||||
create_temp_field_group();
|
||||
@@ -516,7 +597,8 @@ void RdciDmonSubSystem::process() {
|
||||
print_and_clr_notif_pq(¬if_pq, show_timpstamps_);
|
||||
|
||||
for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
|
||||
std::cout << group_info.entity_ids[gindex] << "\t";
|
||||
std::cout << std::setw(12) << std::left << entity_to_string(group_info.entity_ids[gindex])
|
||||
<< "\t";
|
||||
for (uint32_t findex = 0; findex < reg_fields.size(); findex++) {
|
||||
rdc_field_value value;
|
||||
|
||||
|
||||
@@ -184,6 +184,14 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
|
||||
const ::rdc::ClearConfigRequest* request,
|
||||
::rdc::ClearConfigResponse* reply) override;
|
||||
|
||||
::grpc::Status GetNumPartition(::grpc::ServerContext* context,
|
||||
const ::rdc::GetNumPartitionRequest* request,
|
||||
::rdc::GetNumPartitionResponse* reply) override;
|
||||
|
||||
::grpc::Status GetInstanceProfile(::grpc::ServerContext* context,
|
||||
const ::rdc::GetInstanceProfileRequest* request,
|
||||
::rdc::GetInstanceProfileResponse* reply) override;
|
||||
|
||||
private:
|
||||
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
|
||||
rdc_handle_t rdc_handle_;
|
||||
|
||||
@@ -1071,7 +1071,7 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
|
||||
static_cast<::rdc::TopologyLinkInfo_LinkType>(topology_results.link_infos[i].link_type));
|
||||
linkinfos->set_p2p_accessible(topology_results.link_infos[i].is_p2p_accessible);
|
||||
}
|
||||
return ::grpc::Status::OK;
|
||||
return ::grpc::Status::OK;
|
||||
}
|
||||
|
||||
::grpc::Status RdcAPIServiceImpl::SetConfig(::grpc::ServerContext* context,
|
||||
@@ -1140,13 +1140,56 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
|
||||
gpulinkstatus->set_link_types(
|
||||
static_cast<::rdc::GpuLinkStatus_LinkTypes>(link_status_results.gpus[i].link_types));
|
||||
for (uint32_t n = 0; n < link_status_results.gpus[i].num_of_links; n++) {
|
||||
gpulinkstatus->add_link_states(static_cast<::rdc::GpuLinkStatus_LinkState>(
|
||||
link_status_results.gpus[i].link_states[n]));
|
||||
gpulinkstatus->add_link_states(
|
||||
static_cast<::rdc::GpuLinkStatus_LinkState>(link_status_results.gpus[i].link_states[n]));
|
||||
}
|
||||
}
|
||||
|
||||
return ::grpc::Status::OK;
|
||||
}
|
||||
|
||||
/**
 * @brief gRPC handler that reports the number of partitions of a GPU.
 *
 * Passes the request's gpu_index to rdc_get_num_partition() and stores the
 * returned RDC status in the reply. The reply's num_partition field is only
 * set when that status is RDC_ST_OK.
 *
 * @param[in]  context Unused.
 * @param[in]  request Supplies the GPU index to query.
 * @param[out] reply   Receives the RDC status and, on success, the count.
 * @return INTERNAL when request or reply is null; otherwise OK. RDC-level
 *         failures are reported through the reply's status field.
 */
::grpc::Status RdcAPIServiceImpl::GetNumPartition(::grpc::ServerContext* context,
                                                  const ::rdc::GetNumPartitionRequest* request,
                                                  ::rdc::GetNumPartitionResponse* reply) {
  (void)context;
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty request or reply");
  }

  const uint32_t queried_gpu = request->gpu_index();
  uint16_t partition_count = 0;
  const rdc_status_t status = rdc_get_num_partition(rdc_handle_, queried_gpu, &partition_count);

  reply->set_status(status);
  if (status != RDC_ST_OK) {
    // The transport call itself succeeded; the error travels in the reply.
    return ::grpc::Status::OK;
  }

  reply->set_num_partition(partition_count);
  return ::grpc::Status::OK;
}
|
||||
|
||||
/**
 * @brief gRPC handler that returns the resource profile of an entity.
 *
 * Reads entity_index and resource_type from the request, queries
 * rdc_instance_profile_get() and stores the returned RDC status in the reply.
 * The profile fields of the reply are only set when that status is RDC_ST_OK.
 *
 * @param[in]  context Unused.
 * @param[in]  request Supplies the entity index and the resource type.
 * @param[out] reply   Receives the RDC status and, on success, the profile.
 * @return INTERNAL when request or reply is null; otherwise OK. RDC-level
 *         failures are reported through the reply's status field.
 */
::grpc::Status RdcAPIServiceImpl::GetInstanceProfile(
    ::grpc::ServerContext* context, const ::rdc::GetInstanceProfileRequest* request,
    ::rdc::GetInstanceProfileResponse* reply) {
  (void)context;
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty request or reply");
  }

  const uint32_t entity = request->entity_index();
  const uint32_t raw_resource_type = request->resource_type();
  const auto resource_kind = static_cast<rdc_instance_resource_type_t>(raw_resource_type);

  // Start from an all-zero profile so nothing is left indeterminate on failure.
  rdc_resource_profile_t profile = {};

  // Call the RDC API that (in embedded mode) uses AMD SMI
  const rdc_status_t status = rdc_instance_profile_get(rdc_handle_, entity, resource_kind, &profile);

  reply->set_status(status);
  if (status != RDC_ST_OK) {
    // The transport call itself succeeded; the error travels in the reply.
    return ::grpc::Status::OK;
  }

  reply->set_partition_resource(profile.partition_resource);
  reply->set_num_partitions_share_resource(profile.num_partitions_share_resource);
  return ::grpc::Status::OK;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren