fix: [SWDEV-448201] [rocm/amd_smi_lib]

Adds PCIe errors

Code changes related to the following:
  * amdsmi_get_pcie_info()
  * CLI
  * examples

Change-Id: Ie0b7053e77c88fb18309c16e74bce75d862c45a9
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
Этот коммит содержится в:
Oliveira, Daniel
2024-03-05 14:01:06 -06:00
коммит произвёл Guan Yu
родитель 06fa6580c4
Коммит 1310c767ce
7 изменённых файлов: 200 добавлений и 102 удалений
+39 -68
Просмотреть файл
@@ -361,11 +361,11 @@ class AMDSMICommands():
logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info())
try:
link_caps = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)
bus_info['max_pcie_width'] = link_caps['pcie_static']['max_pcie_width']
bus_info['max_pcie_speed'] = link_caps['pcie_static']['max_pcie_speed']
bus_info['pcie_interface_version'] = link_caps['pcie_static']['pcie_interface_version']
pcie_static = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_static']
bus_info['max_pcie_width'] = pcie_static['max_pcie_width']
bus_info['max_pcie_speed'] = pcie_static['max_pcie_speed']
bus_info['pcie_interface_version'] = pcie_static['pcie_interface_version']
bus_info['slot_type'] = pcie_static['slot_type']
if bus_info['max_pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000, 1)
else:
@@ -373,14 +373,6 @@ class AMDSMICommands():
bus_info['max_pcie_speed'] = pcie_speed_GTs_value
slot_type = link_caps['pcie_static']['slot_type']
if isinstance(slot_type, int):
slot_types = amdsmi_interface.amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues
if slot_type in slot_types:
bus_info['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "")
else:
bus_info['slot_type'] = "Unknown"
if bus_info['pcie_interface_version'] > 0:
bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}"
@@ -636,7 +628,7 @@ class AMDSMICommands():
except amdsmi_exception.AmdSmiLibraryException as e:
policy_info = "N/A"
logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['dpm_policy'] = policy_info
if 'numa' in current_platform_args:
if args.numa:
@@ -1460,6 +1452,7 @@ class AMDSMICommands():
if args.pcie:
pcie_dict = {"width": "N/A",
"speed": "N/A",
"bandwidth": "N/A",
"replay_count" : "N/A",
"l0_to_recovery_count" : "N/A",
"replay_roll_over_count" : "N/A",
@@ -1470,65 +1463,43 @@ class AMDSMICommands():
"max_packet_size": "N/A"}
try:
pcie_link_status = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
if pcie_link_status['pcie_metric']['pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000)
pcie_dict['width'] = pcie_metric['pcie_width']
pcie_dict['width'] = pcie_link_status['pcie_metric']['pcie_width']
pcie_dict['speed'] = pcie_speed_GTs_value
if pcie_metric['pcie_speed'] != "N/A":
if pcie_metric['pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000)
pcie_dict['speed'] = pcie_speed_GTs_value
pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth']
pcie_dict['replay_count'] = pcie_metric['pcie_replay_count']
pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count']
pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count']
pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count']
pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count']
pcie_speed_unit = 'GT/s'
pcie_bw_unit = 'Mb/s'
if self.logger.is_human_readable_format():
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}"
if self.logger.is_json_format():
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
"unit" : pcie_speed_unit}
if pcie_dict['speed'] != "N/A":
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
"unit" : pcie_speed_unit}
if pcie_dict['bandwidth'] != "N/A":
pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'],
"unit" : pcie_bw_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
try:
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_count_acc']
if pci_replay_counter == "N/A":
# raising exception here to fall back to sysfs
raise amdsmi_exception.AmdSmiLibraryException(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED)
pcie_dict['replay_count'] = pci_replay_counter
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info())
logging.debug("Falling back to sysfs pci replay counter for gpu %s | %s", gpu_id, e.get_error_info())
try:
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
pcie_dict['replay_count'] = pci_replay_counter
except amdsmi_exception.AmdSmiLibraryException as err:
pcie_dict['replay_count'] = "N/A"
logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info())
try:
l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_l0_to_recov_count_acc']
pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_dict['l0_to_recovery_count'] = "N/A"
logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info())
try:
pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_rover_count_acc']
pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_dict['replay_roll_over_count'] = "N/A"
logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info())
try:
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
pcie_dict['nak_sent_count'] = gpu_metrics_info['pcie_nak_sent_count_acc']
pcie_dict['nak_received_count'] = gpu_metrics_info['pcie_nak_rcvd_count_acc']
except amdsmi_exception.AmdSmiLibraryException as e:
pcie_dict['nak_sent_count'] = "N/A"
pcie_dict['nak_received_count'] = "N/A"
logging.debug("Failed to get pcie nak info for gpu %s | %s", gpu_id, e.get_error_info())
try:
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
@@ -4134,14 +4105,14 @@ class AMDSMICommands():
}
try:
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static']
if pcie_info['max_pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000, 1)
pcie_static = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static']
if pcie_static['max_pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000)
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000)
bitrate = pcie_speed_GTs_value
max_bandwidth = bitrate * pcie_info['max_pcie_width']
max_bandwidth = bitrate * pcie_static['max_pcie_width']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get bitrate and bandwidth for GPU %s | %s", src_gpu_id,
e.get_error_info())
+8
Просмотреть файл
@@ -411,6 +411,14 @@ int main() {
printf("\tPCIe max lanes: %d\n", pcie_info.pcie_static.max_pcie_width);
printf("\tPCIe max speed: %d\n", pcie_info.pcie_static.max_pcie_speed);
// additional pcie related metrics
printf("\tPCIe bandwidth: %d\n", pcie_info.pcie_metric.pcie_bandwidth);
printf("\tPCIe replay count: %d\n", pcie_info.pcie_metric.pcie_replay_count);
printf("\tPCIe L0 recovery count: %d\n", pcie_info.pcie_metric.pcie_l0_to_recovery_count);
printf("\tPCIe rollover count: %d\n", pcie_info.pcie_metric.pcie_replay_roll_over_count);
printf("\tPCIe nak received count: %d\n", pcie_info.pcie_metric.pcie_nak_received_count);
printf("\tPCIe nak sent count: %d\n", pcie_info.pcie_metric.pcie_nak_sent_count);
// Get VRAM temperature limit
int64_t temperature = 0;
ret = amdsmi_get_temp_metric(
+1 -1
Просмотреть файл
@@ -509,7 +509,7 @@ typedef struct {
struct pcie_metric_ {
uint16_t pcie_width; //!< current PCIe width
uint32_t pcie_speed; //!< current PCIe speed in MT/s
uint32_t pcie_bandwidth; //!< current PCIe bandwidth Mb/s
uint32_t pcie_bandwidth; //!< current instantaneous PCIe bandwidth in Mb/s
uint64_t pcie_replay_count; //!< total number of the replays issued on the PCIe link
uint64_t pcie_l0_to_recovery_count; //!< total number of times the PCIe link transitioned from L0 to the recovery state
uint64_t pcie_replay_roll_over_count; //!< total number of replay rollovers issued on the PCIe link
+54
Просмотреть файл
@@ -21,6 +21,9 @@
#ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
#define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
#include <limits>
#include <type_traits>
#include "amd_smi/amdsmi.h"
#include "amd_smi/impl/amd_smi_gpu_device.h"
#include "rocm_smi/rocm_smi_utils.h"
@@ -45,4 +48,55 @@ amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uin
amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(uint32_t device_id, char *market_name);
amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDevice* device, bool *enabled);
/**
 * Variable template that is `false` for every template argument.
 *
 * Because the value is spelled in terms of a template parameter, a
 * `static_assert(is_dependent_false_v<T>, ...)` placed in a discarded
 * `if constexpr` branch is only evaluated when that branch is instantiated.
 */
template<typename /*Unused*/>
constexpr bool is_dependent_false_v{false};
/**
 * True when T, after stripping any reference and top-level cv-qualifiers,
 * is exactly one of std::uint8_t, std::uint16_t, std::uint32_t or
 * std::uint64_t; false for every other type.
 */
template<typename T>
inline constexpr bool is_supported_type_v = std::disjunction_v<
    std::is_same<std::remove_cv_t<std::remove_reference_t<T>>, std::uint8_t>,
    std::is_same<std::remove_cv_t<std::remove_reference_t<T>>, std::uint16_t>,
    std::is_same<std::remove_cv_t<std::remove_reference_t<T>>, std::uint32_t>,
    std::is_same<std::remove_cv_t<std::remove_reference_t<T>>, std::uint64_t>
>;
template<typename T>
constexpr T get_std_num_limit()
{
if constexpr (is_supported_type_v<T>) {
return std::numeric_limits<T>::max();
}
else {
return std::numeric_limits<T>::min();
static_assert(is_dependent_false_v<T>, "Error: Type not supported...");
}
}
/**
 * Tells whether `value` equals the limit reported by get_std_num_limit<T>().
 *
 * @tparam T     Type of the value; forwarded unchanged to get_std_num_limit.
 * @param value  Value to compare against the limit.
 * @return       true when `value` is equal to get_std_num_limit<T>().
 */
template<typename T>
constexpr bool is_std_num_limit(T value)
{
    const T type_limit = get_std_num_limit<T>();
    return type_limit == value;
}
/**
 * Carries a "source is at its type's maximum" marker over to the result type.
 *
 * When `source_value` equals get_std_num_limit<U>(), the result is
 * get_std_num_limit<T>(); otherwise the result is `target_value` converted to
 * T with static_cast.
 *
 * @tparam T            Result type; must satisfy is_supported_type_v.
 * @tparam U            Type of `source_value`; must satisfy is_supported_type_v.
 * @tparam V            Type of `target_value` (defaults to T).
 * @param source_value  Value that is tested against the limit of U.
 * @param target_value  Value to return (as T) when the source is not at its limit.
 * @return              get_std_num_limit<T>() or static_cast<T>(target_value).
 */
template<typename T, typename U, typename V = T>
constexpr T translate_umax_or_assign_value(U source_value, V target_value)
{
    // Only T and U are validated; V is converted with a plain static_cast.
    static_assert(is_supported_type_v<T> && is_supported_type_v<U>,
                  "Error: Type not supported...");

    // If the source value is uint<U>::max(), then the result is uint<T>::max()
    return is_std_num_limit(source_value) ? get_std_num_limit<T>()
                                          : static_cast<T>(target_value);
}
#endif //
+28 -30
Просмотреть файл
@@ -580,7 +580,7 @@ Output: Dictionary with fields
Field | Description
---|---
`fw_list`| List of dictionaries that contain information about a certain firmware block
`fw_list` | List of dictionaries that contain information about a certain firmware block
Exceptions that can be thrown by `amdsmi_get_fw_info` function:
@@ -619,7 +619,7 @@ Output: Dictionary of activites to their respective usage percentage or 'N/A' if
Field | Description
---|---
`gfx_activity`| graphics engine usage percentage (0 - 100)
`gfx_activity` | graphics engine usage percentage (0 - 100)
`umc_activity` | memory engine usage percentage (0 - 100)
`mm_activity` | average multimedia engine usages in percentage (0 - 100)
@@ -659,7 +659,7 @@ Output: Dictionary with fields
Field | Description
---|---
`average_socket_power`| average socket power
`average_socket_power` | average socket power
`gfx_voltage` | voltage gfx
`power_limit` | power limit
@@ -699,7 +699,7 @@ Output: Dictionary with fields
Field | Description
---|---
`vram_total` | VRAM total
`vram_used`| VRAM currently in use
`vram_used` | VRAM currently in use
Exceptions that can be thrown by `amdsmi_get_gpu_vram_usage` function:
@@ -751,7 +751,7 @@ Output: Dictionary with fields
Field | Description
---|---
`cur_clk`| Current clock for given clock type
`cur_clk` | Current clock for given clock type
`max_clk` | Maximum clock for given clock type
`min_clk` | Minimum clock for given clock type
@@ -780,20 +780,19 @@ except AmdSmiException as e:
### amdsmi_get_pcie_info
Description: Returns the pcie link status for the given GPU.
Description: Returns the pcie metric and static information for the given GPU.
It is not supported on virtual machine guest
Input parameters:
* `processor_handle` device which to query
Output: Dictionary with fields
Output: Dictionary with 2 fields `pcie_static` and `pcie_metric`
Field | Description
Fields | Description
---|---
`pcie_width`| pcie lanes in use
`pcie_speed`| current pcie speed
`pcie_interface_version`| current pcie generation
`pcie_static` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`max_pcie_width`</td><td>Maximum number of PCIe lanes available</td></tr><tr><td>`max_pcie_speed`</td><td>Maximum supported PCIe speed in MT/s</td></tr><tr><td>`pcie_interface_version`</td><td>PCIe generation, i.e. 3, 4, 5...</td></tr><tr><td>`slot_type`</td><td>The type of form factor of the slot: PCIE, OAM, or Unknown</td></tr></tbody></table>
`pcie_metric` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`pcie_width`</td><td>Current number of PCIe lanes in use</td></tr><tr><td>`pcie_speed`</td><td>Current PCIe speed in MT/s</td></tr><tr><td>`pcie_bandwidth`</td><td>Current instantaneous bandwidth usage in Mb/s</td></tr><tr><td>`pcie_replay_count`</td><td>Total number of PCIe replays (NAKs)</td></tr><tr><td>`pcie_l0_to_recovery_count`</td><td>PCIe L0 to recovery state transition accumulated count</td></tr><tr><td>`pcie_replay_roll_over_count`</td><td>PCIe replay rollover accumulated count</td></tr><tr><td>`pcie_nak_sent_count`</td><td>PCIe NAK sent accumulated count</td></tr><tr><td>`pcie_nak_received_count`</td><td>PCIe NAK received accumulated count</td></tr></tbody></table>
Exceptions that can be thrown by `amdsmi_get_pcie_info` function:
@@ -810,10 +809,9 @@ try:
print("No GPUs on machine")
else:
for device in devices:
pcie_link_status = amdsmi_get_pcie_info(device)
print(pcie_link_status["pcie_width"])
print(pcie_link_status["pcie_speed"])
print(pcie_link_status["pcie_interface_version"])
pcie_info = amdsmi_get_pcie_info(device)
print(pcie_info["pcie_static"])
print(pcie_info["pcie_metric"])
except AmdSmiException as e:
print(e)
```
@@ -949,8 +947,8 @@ Output: Dictionary with fields
Field | Description
---|---
`correctable_count`| Correctable ECC error count
`uncorrectable_count`| Uncorrectable ECC error count
`correctable_count` | Correctable ECC error count
`uncorrectable_count` | Uncorrectable ECC error count
Exceptions that can be thrown by `amdsmi_get_gpu_total_ecc_count` function:
@@ -2021,9 +2019,9 @@ Output: Dictionary with fields
Field | Description
---|---
`num_supported`| The number of supported frequencies
`current`| The current frequency index
`frequency`| List of frequencies, only the first num_supported frequencies are valid
`num_supported` | The number of supported frequencies
`current` | The current frequency index
`frequency` | List of frequencies, only the first num_supported frequencies are valid
Exceptions that can be thrown by `amdsmi_get_clk_freq` function:
@@ -2062,8 +2060,8 @@ Field | Description
`curr_mclk_range` | <table> <thead><tr><th> Subfield </th><th>Description</th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range</td></tr></tbody></table>
`sclk_freq_limits` | <table> <thead><tr><th> Subfield </th><th>Description</th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound sclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound sclk range limit</td></tr></tbody></table>
`mclk_freq_limits` | <table> <thead><tr><th> Subfield </th><th>Description</th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range limit</td></tr></tbody></table>
`curve.vc_points`| The number of supported frequencies
`num_regions`| The current frequency index
`curve.vc_points` | The number of supported frequencies
`num_regions` | The current frequency index
Exceptions that can be thrown by `amdsmi_get_gpu_od_volt_info` function:
@@ -2228,9 +2226,9 @@ Output: Dictionary with fields
Field | Description
---|---
`available_profiles`| Which profiles are supported by this system
`current`| Which power profile is currently active
`num_profiles`| How many power profiles are available
`available_profiles` | Which profiles are supported by this system
`current` | Which power profile is currently active
`num_profiles` | How many power profiles are available
Exceptions that can be thrown by `amdsmi_get_gpu_power_profile_presets` function:
@@ -2391,9 +2389,9 @@ Output: Dictionary with fields
Field | Description
---|---
`value`| Counter value
`time_enabled`| Time that the counter was enabled in nanoseconds
`time_running`| Time that the counter was running in nanoseconds
`value` | Counter value
`time_enabled` | Time that the counter was enabled in nanoseconds
`time_running` | Time that the counter was running in nanoseconds
Exceptions that can be thrown by `amdsmi_gpu_read_counter` function:
@@ -2661,8 +2659,8 @@ Output: Dict containing information about error counts
Field | Description
---|---
`correctable_count`| Count of correctable errors
`uncorrectable_count`| Count of uncorrectable errors
`correctable_count` | Count of correctable errors
`uncorrectable_count` | Count of uncorrectable errors
Exceptions that can be thrown by `amdsmi_get_gpu_ecc_count` function:
+45 -2
Просмотреть файл
@@ -2134,7 +2134,7 @@ def amdsmi_get_pcie_info(
)
)
return {
pcie_info_dict = {
"pcie_static": {
"max_pcie_width": pcie_info.pcie_static.max_pcie_width,
"max_pcie_speed": pcie_info.pcie_static.max_pcie_speed,
@@ -2153,6 +2153,49 @@ def amdsmi_get_pcie_info(
}
}
# Check pcie static values for uint max
if pcie_info_dict['pcie_static']['max_pcie_width'] == 0xFFFF:
pcie_info_dict['pcie_static']['max_pcie_width'] = "N/A"
if pcie_info_dict['pcie_static']['max_pcie_speed'] == 0xFFFFFFFF:
pcie_info_dict['pcie_static']['max_pcie_speed'] = "N/A"
if pcie_info_dict['pcie_static']['pcie_interface_version'] == 0xFFFFFFFF:
pcie_info_dict['pcie_static']['pcie_interface_version'] = "N/A"
slot_type = pcie_info_dict['pcie_static']['slot_type']
if isinstance(slot_type, int):
slot_types = amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues
if slot_type in slot_types:
pcie_info_dict['pcie_static']['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "")
else:
pcie_info_dict['pcie_static']['slot_type'] = "Unknown"
else:
pcie_info_dict['pcie_static']['slot_type'] = "N/A"
# Check pcie metric values for uint max
if pcie_info_dict['pcie_metric']['pcie_width'] == 0xFFFF:
pcie_info_dict['pcie_metric']['pcie_width'] = "N/A"
if pcie_info_dict['pcie_metric']['pcie_speed'] == 0xFFFFFFFF:
pcie_info_dict['pcie_metric']['pcie_speed'] = "N/A"
if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF:
pcie_info_dict['pcie_metric']['pcie_bandwidth'] = "N/A"
# TODO Just Navi 21 has a different uint max size for pcie_bandwidth
# if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF:
# pcie_info_dict['pcie_metric']['pcie_bandwidth'] = "N/A"
if pcie_info_dict['pcie_metric']['pcie_replay_count'] == 0xFFFFFFFFFFFFFFFF:
pcie_info_dict['pcie_metric']['pcie_replay_count'] = "N/A"
if pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] == 0xFFFFFFFFFFFFFFFF:
pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] = "N/A"
if pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] == 0xFFFFFFFFFFFFFFFF:
pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] = "N/A"
if pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] == 0xFFFFFFFFFFFFFFFF:
pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] = "N/A"
if pcie_info_dict['pcie_metric']['pcie_nak_received_count'] == 0xFFFFFFFFFFFFFFFF:
pcie_info_dict['pcie_metric']['pcie_nak_received_count'] = "N/A"
return pcie_info_dict
def amdsmi_get_processor_handle_from_bdf(bdf):
bdf = _parse_bdf(bdf)
@@ -3275,7 +3318,7 @@ def amdsmi_get_dpm_policy(
processor_handle, ctypes.byref(policy)
)
)
polices = []
for i in range(0, policy.num_supported):
id = policy.policies[i].policy_id
+25 -1
Просмотреть файл
@@ -2052,9 +2052,33 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
status = smi_amdgpu_get_pcie_speed_from_pcie_type(metric_info.pcie_link_speed, &info->pcie_metric.pcie_speed); // mapping to MT/s
} else {
// gpu metrics returns pcie link speed in .1 GT/s ex. 160 vs 16
info->pcie_metric.pcie_speed = metric_info.pcie_link_speed * 100;
info->pcie_metric.pcie_speed = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_speed)>
(metric_info.pcie_link_speed, (metric_info.pcie_link_speed * 100));
}
// additional pcie related metrics
/**
* pcie_metric.pcie_bandwidth: MB/s (uint32_t)
* metric_info.pcie_bandwidth_inst: GB/s (uint64_t)
*/
info->pcie_metric.pcie_bandwidth = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_bandwidth)>
(metric_info.pcie_bandwidth_inst, metric_info.pcie_bandwidth_inst);
info->pcie_metric.pcie_replay_count = metric_info.pcie_replay_count_acc;
info->pcie_metric.pcie_l0_to_recovery_count = metric_info.pcie_l0_to_recov_count_acc;
info->pcie_metric.pcie_replay_roll_over_count = metric_info.pcie_replay_rover_count_acc;
/**
* pcie_metric.pcie_nak_received_count: (uint64_t)
* metric_info.pcie_nak_rcvd_count_acc: (uint32_t)
*/
info->pcie_metric.pcie_nak_received_count = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_nak_received_count)>
(metric_info.pcie_nak_rcvd_count_acc, (metric_info.pcie_nak_rcvd_count_acc));
/**
* pcie_metric.pcie_nak_sent_count: (uint64_t)
* metric_info.pcie_nak_sent_count_acc: (uint32_t)
*/
info->pcie_metric.pcie_nak_sent_count = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_nak_sent_count)>
(metric_info.pcie_nak_sent_count_acc, (metric_info.pcie_nak_sent_count_acc));
return AMDSMI_STATUS_SUCCESS;
}