fix: [SWDEV-448201] [rocm/amd_smi_lib]
Add PCIe errors. Code changes related to the following: * amdsmi_get_pcie_info() * CLI * examples Change-Id: Ie0b7053e77c88fb18309c16e74bce75d862c45a9 Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
Этот коммит содержится в:
коммит произвёл
Guan Yu
родитель
06fa6580c4
Коммит
1310c767ce
@@ -361,11 +361,11 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
link_caps = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)
|
||||
bus_info['max_pcie_width'] = link_caps['pcie_static']['max_pcie_width']
|
||||
bus_info['max_pcie_speed'] = link_caps['pcie_static']['max_pcie_speed']
|
||||
bus_info['pcie_interface_version'] = link_caps['pcie_static']['pcie_interface_version']
|
||||
|
||||
pcie_static = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_static']
|
||||
bus_info['max_pcie_width'] = pcie_static['max_pcie_width']
|
||||
bus_info['max_pcie_speed'] = pcie_static['max_pcie_speed']
|
||||
bus_info['pcie_interface_version'] = pcie_static['pcie_interface_version']
|
||||
bus_info['slot_type'] = pcie_static['slot_type']
|
||||
if bus_info['max_pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000, 1)
|
||||
else:
|
||||
@@ -373,14 +373,6 @@ class AMDSMICommands():
|
||||
|
||||
bus_info['max_pcie_speed'] = pcie_speed_GTs_value
|
||||
|
||||
slot_type = link_caps['pcie_static']['slot_type']
|
||||
if isinstance(slot_type, int):
|
||||
slot_types = amdsmi_interface.amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues
|
||||
if slot_type in slot_types:
|
||||
bus_info['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "")
|
||||
else:
|
||||
bus_info['slot_type'] = "Unknown"
|
||||
|
||||
if bus_info['pcie_interface_version'] > 0:
|
||||
bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}"
|
||||
|
||||
@@ -636,7 +628,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
policy_info = "N/A"
|
||||
logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
|
||||
static_dict['dpm_policy'] = policy_info
|
||||
if 'numa' in current_platform_args:
|
||||
if args.numa:
|
||||
@@ -1460,6 +1452,7 @@ class AMDSMICommands():
|
||||
if args.pcie:
|
||||
pcie_dict = {"width": "N/A",
|
||||
"speed": "N/A",
|
||||
"bandwidth": "N/A",
|
||||
"replay_count" : "N/A",
|
||||
"l0_to_recovery_count" : "N/A",
|
||||
"replay_roll_over_count" : "N/A",
|
||||
@@ -1470,65 +1463,43 @@ class AMDSMICommands():
|
||||
"max_packet_size": "N/A"}
|
||||
|
||||
try:
|
||||
pcie_link_status = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)
|
||||
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
|
||||
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
|
||||
|
||||
if pcie_link_status['pcie_metric']['pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000, 1)
|
||||
else:
|
||||
pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000)
|
||||
pcie_dict['width'] = pcie_metric['pcie_width']
|
||||
|
||||
pcie_dict['width'] = pcie_link_status['pcie_metric']['pcie_width']
|
||||
pcie_dict['speed'] = pcie_speed_GTs_value
|
||||
if pcie_metric['pcie_speed'] != "N/A":
|
||||
if pcie_metric['pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1)
|
||||
else:
|
||||
pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000)
|
||||
pcie_dict['speed'] = pcie_speed_GTs_value
|
||||
|
||||
pcie_dict['bandwidth'] = pcie_metric['pcie_bandwidth']
|
||||
pcie_dict['replay_count'] = pcie_metric['pcie_replay_count']
|
||||
pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count']
|
||||
pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count']
|
||||
pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count']
|
||||
pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count']
|
||||
|
||||
pcie_speed_unit = 'GT/s'
|
||||
pcie_bw_unit = 'Mb/s'
|
||||
if self.logger.is_human_readable_format():
|
||||
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
|
||||
if pcie_dict['speed'] != "N/A":
|
||||
pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}"
|
||||
if pcie_dict['bandwidth'] != "N/A":
|
||||
pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}"
|
||||
if self.logger.is_json_format():
|
||||
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
|
||||
"unit" : pcie_speed_unit}
|
||||
if pcie_dict['speed'] != "N/A":
|
||||
pcie_dict['speed'] = {"value" : pcie_dict['speed'],
|
||||
"unit" : pcie_speed_unit}
|
||||
if pcie_dict['bandwidth'] != "N/A":
|
||||
pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'],
|
||||
"unit" : pcie_bw_unit}
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_count_acc']
|
||||
if pci_replay_counter == "N/A":
|
||||
# raising exception here to fall back to sysfs
|
||||
raise amdsmi_exception.AmdSmiLibraryException(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED)
|
||||
pcie_dict['replay_count'] = pci_replay_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
logging.debug("Falling back to sysfs pci replay counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
try:
|
||||
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
|
||||
pcie_dict['replay_count'] = pci_replay_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as err:
|
||||
pcie_dict['replay_count'] = "N/A"
|
||||
logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info())
|
||||
|
||||
try:
|
||||
l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_l0_to_recov_count_acc']
|
||||
pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
pcie_dict['l0_to_recovery_count'] = "N/A"
|
||||
logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_rover_count_acc']
|
||||
pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
pcie_dict['replay_roll_over_count'] = "N/A"
|
||||
logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
pcie_dict['nak_sent_count'] = gpu_metrics_info['pcie_nak_sent_count_acc']
|
||||
pcie_dict['nak_received_count'] = gpu_metrics_info['pcie_nak_rcvd_count_acc']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
pcie_dict['nak_sent_count'] = "N/A"
|
||||
pcie_dict['nak_received_count'] = "N/A"
|
||||
logging.debug("Failed to get pcie nak info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu)
|
||||
sent = pcie_bw['sent'] * pcie_bw['max_pkt_sz']
|
||||
@@ -4134,14 +4105,14 @@ class AMDSMICommands():
|
||||
}
|
||||
|
||||
try:
|
||||
pcie_info = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static']
|
||||
if pcie_info['max_pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000, 1)
|
||||
pcie_static = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static']
|
||||
if pcie_static['max_pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000, 1)
|
||||
else:
|
||||
pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000)
|
||||
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000)
|
||||
|
||||
bitrate = pcie_speed_GTs_value
|
||||
max_bandwidth = bitrate * pcie_info['max_pcie_width']
|
||||
max_bandwidth = bitrate * pcie_static['max_pcie_width']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get bitrate and bandwidth for GPU %s | %s", src_gpu_id,
|
||||
e.get_error_info())
|
||||
|
||||
@@ -411,6 +411,14 @@ int main() {
|
||||
printf("\tPCIe max lanes: %d\n", pcie_info.pcie_static.max_pcie_width);
|
||||
printf("\tPCIe max speed: %d\n", pcie_info.pcie_static.max_pcie_speed);
|
||||
|
||||
// additional pcie related metrics
|
||||
printf("\tPCIe bandwidth: %d\n", pcie_info.pcie_metric.pcie_bandwidth);
|
||||
printf("\tPCIe replay count: %d\n", pcie_info.pcie_metric.pcie_replay_count);
|
||||
printf("\tPCIe L0 recovery count: %d\n", pcie_info.pcie_metric.pcie_l0_to_recovery_count);
|
||||
printf("\tPCIe rollover count: %d\n", pcie_info.pcie_metric.pcie_replay_roll_over_count);
|
||||
printf("\tPCIe nak received count: %d\n", pcie_info.pcie_metric.pcie_nak_received_count);
|
||||
printf("\tPCIe nak sent count: %d\n", pcie_info.pcie_metric.pcie_nak_sent_count);
|
||||
|
||||
// Get VRAM temperature limit
|
||||
int64_t temperature = 0;
|
||||
ret = amdsmi_get_temp_metric(
|
||||
|
||||
@@ -509,7 +509,7 @@ typedef struct {
|
||||
struct pcie_metric_ {
|
||||
uint16_t pcie_width; //!< current PCIe width
|
||||
uint32_t pcie_speed; //!< current PCIe speed in MT/s
|
||||
uint32_t pcie_bandwidth; //!< current PCIe bandwidth Mb/s
|
||||
uint32_t pcie_bandwidth; //!< current instantaneous PCIe bandwidth in Mb/s
|
||||
uint64_t pcie_replay_count; //!< total number of the replays issued on the PCIe link
|
||||
uint64_t pcie_l0_to_recovery_count; //!< total number of times the PCIe link transitioned from L0 to the recovery state
|
||||
uint64_t pcie_replay_roll_over_count; //!< total number of replay rollovers issued on the PCIe link
|
||||
|
||||
@@ -21,6 +21,9 @@
|
||||
#ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
|
||||
#define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
|
||||
|
||||
#include <cstdint>
#include <limits>
#include <type_traits>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_gpu_device.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
@@ -45,4 +48,55 @@ amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uin
|
||||
amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(uint32_t device_id, char *market_name);
|
||||
amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDevice* device, bool *enabled);
|
||||
|
||||
|
||||
/**
 * Always-false constant that depends on a template parameter.
 *
 * Used as the condition of a static_assert inside a discarded
 * `if constexpr` branch, so the assertion only fires when that branch is
 * actually instantiated.
 */
template<typename>
inline constexpr bool is_dependent_false_v = false;

/**
 * True when T, with references and cv-qualifiers removed, is one of
 * std::uint8_t, std::uint16_t, std::uint32_t or std::uint64_t.
 */
template<typename T>
inline constexpr bool is_supported_type_v = (
    std::is_same_v<std::remove_cv_t<std::remove_reference_t<T>>, std::uint8_t>  ||
    std::is_same_v<std::remove_cv_t<std::remove_reference_t<T>>, std::uint16_t> ||
    std::is_same_v<std::remove_cv_t<std::remove_reference_t<T>>, std::uint32_t> ||
    std::is_same_v<std::remove_cv_t<std::remove_reference_t<T>>, std::uint64_t>
);

/**
 * Returns std::numeric_limits<T>::max() for a supported unsigned type.
 *
 * Instantiating this with a type that does not satisfy is_supported_type_v
 * is a compile-time error.
 */
template<typename T>
constexpr T get_std_num_limit()
{
    if constexpr (is_supported_type_v<T>) {
        return std::numeric_limits<T>::max();
    } else {
        // No return here on purpose: the assertion makes this branch ill-formed
        // as soon as it is instantiated, so no value can ever be produced.
        static_assert(is_dependent_false_v<T>, "Error: Type not supported...");
    }
}

/**
 * Returns true when `value` equals the maximum value representable by T.
 */
template<typename T>
constexpr bool is_std_num_limit(T value)
{
    return (value == get_std_num_limit<T>());
}

/**
 * Maps a "max value" source onto a "max value" result, otherwise converts
 * `target_value` to T.
 *
 * @tparam T  result type (must satisfy is_supported_type_v)
 * @tparam U  type of the value being inspected (must satisfy is_supported_type_v)
 * @tparam V  type of the value to assign when the source is not at its maximum
 *
 * @param source_value  value checked against std::numeric_limits<U>::max()
 * @param target_value  value converted with static_cast<T> when the check fails;
 *                      no range check is performed on the conversion
 *
 * @return std::numeric_limits<T>::max() if source_value is the maximum of U,
 *         static_cast<T>(target_value) otherwise
 */
template<typename T, typename U, typename V = T>
constexpr T translate_umax_or_assign_value(U source_value, V target_value)
{
    if constexpr (is_supported_type_v<T> && is_supported_type_v<U>) {
        // If the source value is uint<U>::max(), then return is uint<T>::max()
        if (is_std_num_limit(source_value)) {
            return get_std_num_limit<T>();
        }
        return static_cast<T>(target_value);
    } else {
        static_assert(is_dependent_false_v<T>, "Error: Type not supported...");
    }
}
|
||||
|
||||
#endif  // AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
|
||||
|
||||
@@ -580,7 +580,7 @@ Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`fw_list`| List of dictionaries that contain information about a certain firmware block
|
||||
`fw_list` | List of dictionaries that contain information about a certain firmware block
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_fw_info` function:
|
||||
|
||||
@@ -619,7 +619,7 @@ Output: Dictionary of activites to their respective usage percentage or 'N/A' if
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`gfx_activity`| graphics engine usage percentage (0 - 100)
|
||||
`gfx_activity` | graphics engine usage percentage (0 - 100)
|
||||
`umc_activity` | memory engine usage percentage (0 - 100)
|
||||
`mm_activity` | average multimedia engine usages in percentage (0 - 100)
|
||||
|
||||
@@ -659,7 +659,7 @@ Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`average_socket_power`| average socket power
|
||||
`average_socket_power` | average socket power
|
||||
`gfx_voltage` | voltage gfx
|
||||
`power_limit` | power limit
|
||||
|
||||
@@ -699,7 +699,7 @@ Output: Dictionary with fields
|
||||
Field | Description
|
||||
---|---
|
||||
`vram_total` | VRAM total
|
||||
`vram_used`| VRAM currently in use
|
||||
`vram_used` | VRAM currently in use
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_vram_usage` function:
|
||||
|
||||
@@ -751,7 +751,7 @@ Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`cur_clk`| Current clock for given clock type
|
||||
`cur_clk` | Current clock for given clock type
|
||||
`max_clk` | Maximum clock for given clock type
|
||||
`min_clk` | Minimum clock for given clock type
|
||||
|
||||
@@ -780,20 +780,19 @@ except AmdSmiException as e:
|
||||
|
||||
### amdsmi_get_pcie_info
|
||||
|
||||
Description: Returns the pcie link status for the given GPU.
|
||||
Description: Returns the pcie metric and static information for the given GPU.
|
||||
It is not supported on virtual machine guests.
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
|
||||
Output: Dictionary with fields
|
||||
Output: Dictionary with 2 fields `pcie_static` and `pcie_metric`
|
||||
|
||||
Field | Description
|
||||
Fields | Description
|
||||
---|---
|
||||
`pcie_width`| pcie lanes in use
|
||||
`pcie_speed`| current pcie speed
|
||||
`pcie_interface_version`| current pcie generation
|
||||
`pcie_static` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`max_pcie_width`</td><td>Maximum number of pcie lanes available</td></tr><tr><td>`max_pcie_speed`</td><td>Maximum capable pcie speed in MT/s</td></tr><tr><td>`pcie_interface_version`</td><td>PCIe generation, i.e. 3, 4, 5...</td></tr><tr><td>`slot_type`</td><td>The type of form factor of the slot: PCIE, OAM, or Unknown</td></tr></tbody></table>
|
||||
`pcie_metric` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`pcie_width`</td><td>Current number of pcie lanes in use</td></tr><tr><td>`pcie_speed`</td><td>Current pcie speed in MT/s</td></tr><tr><td>`pcie_bandwidth`</td><td>Current instantaneous bandwidth usage in Mb/s</td></tr><tr><td>`pcie_replay_count`</td><td>Total number of PCIe replays (NAKs)</td></tr><tr><td>`pcie_l0_to_recovery_count`</td><td>PCIE L0 to recovery state transition accumulated count</td></tr><tr><td>`pcie_replay_roll_over_count`</td><td>PCIe replay rollover accumulated count</td></tr><tr><td>`pcie_nak_sent_count`</td><td>PCIe NAK sent accumulated count</td></tr><tr><td>`pcie_nak_received_count`</td><td>PCIe NAK received accumulated count</td></tr></tbody></table>
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_pcie_info` function:
|
||||
|
||||
@@ -810,10 +809,9 @@ try:
|
||||
print("No GPUs on machine")
|
||||
else:
|
||||
for device in devices:
|
||||
pcie_link_status = amdsmi_get_pcie_info(device)
|
||||
print(pcie_link_status["pcie_width"])
|
||||
print(pcie_link_status["pcie_speed"])
|
||||
print(pcie_link_status["pcie_interface_version"])
|
||||
pcie_info = amdsmi_get_pcie_info(device)
|
||||
print(pcie_info["pcie_static"])
|
||||
print(pcie_info["pcie_metric"])
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
@@ -949,8 +947,8 @@ Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`correctable_count`| Correctable ECC error count
|
||||
`uncorrectable_count`| Uncorrectable ECC error count
|
||||
`correctable_count` | Correctable ECC error count
|
||||
`uncorrectable_count` | Uncorrectable ECC error count
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_total_ecc_count` function:
|
||||
|
||||
@@ -2021,9 +2019,9 @@ Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`num_supported`| The number of supported frequencies
|
||||
`current`| The current frequency index
|
||||
`frequency`| List of frequencies, only the first num_supported frequencies are valid
|
||||
`num_supported` | The number of supported frequencies
|
||||
`current` | The current frequency index
|
||||
`frequency` | List of frequencies, only the first num_supported frequencies are valid
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_clk_freq` function:
|
||||
|
||||
@@ -2062,8 +2060,8 @@ Field | Description
|
||||
`curr_mclk_range` | <table> <thead><tr><th> Subfield </th><th>Description</th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range</td></tr></tbody></table>
|
||||
`sclk_freq_limits` | <table> <thead><tr><th> Subfield </th><th>Description</th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound sclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound sclk range limit</td></tr></tbody></table>
|
||||
`mclk_freq_limits` | <table> <thead><tr><th> Subfield </th><th>Description</th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range limit</td></tr></tbody></table>
|
||||
`curve.vc_points`| The number of supported frequencies
|
||||
`num_regions`| The current frequency index
|
||||
`curve.vc_points` | The number of supported frequencies
|
||||
`num_regions` | The current frequency index
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_od_volt_info` function:
|
||||
|
||||
@@ -2228,9 +2226,9 @@ Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`available_profiles`| Which profiles are supported by this system
|
||||
`current`| Which power profile is currently active
|
||||
`num_profiles`| How many power profiles are available
|
||||
`available_profiles` | Which profiles are supported by this system
|
||||
`current` | Which power profile is currently active
|
||||
`num_profiles` | How many power profiles are available
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_power_profile_presets` function:
|
||||
|
||||
@@ -2391,9 +2389,9 @@ Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`value`| Counter value
|
||||
`time_enabled`| Time that the counter was enabled in nanoseconds
|
||||
`time_running`| Time that the counter was running in nanoseconds
|
||||
`value` | Counter value
|
||||
`time_enabled` | Time that the counter was enabled in nanoseconds
|
||||
`time_running` | Time that the counter was running in nanoseconds
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_gpu_read_counter` function:
|
||||
|
||||
@@ -2661,8 +2659,8 @@ Output: Dict containing information about error counts
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`correctable_count`| Count of correctable errors
|
||||
`uncorrectable_count`| Count of uncorrectable errors
|
||||
`correctable_count` | Count of correctable errors
|
||||
`uncorrectable_count` | Count of uncorrectable errors
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_ecc_count` function:
|
||||
|
||||
|
||||
@@ -2134,7 +2134,7 @@ def amdsmi_get_pcie_info(
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
pcie_info_dict = {
|
||||
"pcie_static": {
|
||||
"max_pcie_width": pcie_info.pcie_static.max_pcie_width,
|
||||
"max_pcie_speed": pcie_info.pcie_static.max_pcie_speed,
|
||||
@@ -2153,6 +2153,49 @@ def amdsmi_get_pcie_info(
|
||||
}
|
||||
}
|
||||
|
||||
# Check pcie static values for uint max
|
||||
if pcie_info_dict['pcie_static']['max_pcie_width'] == 0xFFFF:
|
||||
pcie_info_dict['pcie_static']['max_pcie_width'] = "N/A"
|
||||
if pcie_info_dict['pcie_static']['max_pcie_speed'] == 0xFFFFFFFF:
|
||||
pcie_info_dict['pcie_static']['max_pcie_speed'] = "N/A"
|
||||
if pcie_info_dict['pcie_static']['pcie_interface_version'] == 0xFFFFFFFF:
|
||||
pcie_info_dict['pcie_static']['pcie_interface_version'] = "N/A"
|
||||
|
||||
slot_type = pcie_info_dict['pcie_static']['slot_type']
|
||||
if isinstance(slot_type, int):
|
||||
slot_types = amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues
|
||||
if slot_type in slot_types:
|
||||
pcie_info_dict['pcie_static']['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "")
|
||||
else:
|
||||
pcie_info_dict['pcie_static']['slot_type'] = "Unknown"
|
||||
else:
|
||||
pcie_info_dict['pcie_static']['slot_type'] = "N/A"
|
||||
|
||||
# Check pcie metric values for uint max
|
||||
if pcie_info_dict['pcie_metric']['pcie_width'] == 0xFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_width'] = "N/A"
|
||||
if pcie_info_dict['pcie_metric']['pcie_speed'] == 0xFFFFFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_speed'] = "N/A"
|
||||
if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_bandwidth'] = "N/A"
|
||||
|
||||
# TODO Just Navi 21 has a different uint max size for pcie_bandwidth
|
||||
# if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF:
|
||||
# pcie_info_dict['pcie_metric']['pcie_bandwidth'] = "N/A"
|
||||
|
||||
if pcie_info_dict['pcie_metric']['pcie_replay_count'] == 0xFFFFFFFFFFFFFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_replay_count'] = "N/A"
|
||||
if pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] == 0xFFFFFFFFFFFFFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] = "N/A"
|
||||
if pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] == 0xFFFFFFFFFFFFFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] = "N/A"
|
||||
if pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] == 0xFFFFFFFFFFFFFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] = "N/A"
|
||||
if pcie_info_dict['pcie_metric']['pcie_nak_received_count'] == 0xFFFFFFFFFFFFFFFF:
|
||||
pcie_info_dict['pcie_metric']['pcie_nak_received_count'] = "N/A"
|
||||
|
||||
return pcie_info_dict
|
||||
|
||||
|
||||
def amdsmi_get_processor_handle_from_bdf(bdf):
|
||||
bdf = _parse_bdf(bdf)
|
||||
@@ -3275,7 +3318,7 @@ def amdsmi_get_dpm_policy(
|
||||
processor_handle, ctypes.byref(policy)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
polices = []
|
||||
for i in range(0, policy.num_supported):
|
||||
id = policy.policies[i].policy_id
|
||||
|
||||
@@ -2052,9 +2052,33 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
|
||||
status = smi_amdgpu_get_pcie_speed_from_pcie_type(metric_info.pcie_link_speed, &info->pcie_metric.pcie_speed); // mapping to MT/s
|
||||
} else {
|
||||
// gpu metrics returns pcie link speed in .1 GT/s ex. 160 vs 16
|
||||
info->pcie_metric.pcie_speed = metric_info.pcie_link_speed * 100;
|
||||
info->pcie_metric.pcie_speed = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_speed)>
|
||||
(metric_info.pcie_link_speed, (metric_info.pcie_link_speed * 100));
|
||||
}
|
||||
|
||||
// additional pcie related metrics
|
||||
/**
|
||||
* pcie_metric.pcie_bandwidth: MB/s (uint32_t)
|
||||
* metric_info.pcie_bandwidth_inst: GB/s (uint64_t)
|
||||
*/
|
||||
info->pcie_metric.pcie_bandwidth = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_bandwidth)>
|
||||
(metric_info.pcie_bandwidth_inst, metric_info.pcie_bandwidth_inst);
|
||||
info->pcie_metric.pcie_replay_count = metric_info.pcie_replay_count_acc;
|
||||
info->pcie_metric.pcie_l0_to_recovery_count = metric_info.pcie_l0_to_recov_count_acc;
|
||||
info->pcie_metric.pcie_replay_roll_over_count = metric_info.pcie_replay_rover_count_acc;
|
||||
/**
|
||||
* pcie_metric.pcie_nak_received_count: (uint64_t)
|
||||
* metric_info.pcie_nak_rcvd_count_acc: (uint32_t)
|
||||
*/
|
||||
info->pcie_metric.pcie_nak_received_count = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_nak_received_count)>
|
||||
(metric_info.pcie_nak_rcvd_count_acc, (metric_info.pcie_nak_rcvd_count_acc));
|
||||
/**
|
||||
* pcie_metric.pcie_nak_sent_count: (uint64_t)
|
||||
* metric_info.pcie_nak_sent_count_acc: (uint32_t)
|
||||
*/
|
||||
info->pcie_metric.pcie_nak_sent_count = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_nak_sent_count)>
|
||||
(metric_info.pcie_nak_sent_count_acc, (metric_info.pcie_nak_sent_count_acc));
|
||||
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user