[SWDEV-530633] Use gpu_metric speed and BW for xgmi (#366)

The xgmi command was showing pcie bit rate and bandwidth instead of xgmi. Corrected the API to get xgmi data from gpu metric.
Added python API for amdsmi_get_link_metrics. Modified the amdsmi_link_metrics struct.
Added a check so that the xgmi command skips any GPU whose current partition ID is non-zero.

---------

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Этот коммит содержится в:
Kanangot Balakrishnan, Bindhiya
2025-05-30 16:51:11 -05:00
коммит произвёл GitHub
родитель 2e8aaf02c9
Коммит 2eff0b3764
7 изменённых файлов: 147 добавлений и 31 удалений
+50 -19
Просмотреть файл
@@ -40,7 +40,7 @@ class AMDSMICommands():
Each command function will interact with AMDSMILogger to handle
displaying the output to the specified format and destination.
"""
def __init__(self, format='human_readable', destination='stdout') -> None:
self.helpers = AMDSMIHelpers()
self.logger = AMDSMILogger(format=format, destination=destination)
@@ -2024,7 +2024,7 @@ class AMDSMICommands():
except KeyError as e:
logging.debug("Failed to get current_socclk for gpu %s | %s", gpu_id, e)
# Populate the max and min clock values from sysfs.
# Min and Max values are per clock type, not per clock engine.
# Populate the deep sleep value from amdsmi_get_clock_info
@@ -2075,7 +2075,7 @@ class AMDSMICommands():
# Iterate through the maximum number of VCLK clocks supported
for index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
vclk_index = f"vclk_{index}" # Construct the index key for the clock
# Check if the current clock value is not "N/A"
if clocks[vclk_index]["clk"] != "N/A":
# Format and assign the minimum clock value for the current VCLK
@@ -4480,7 +4480,7 @@ class AMDSMICommands():
future_set_count = self.helpers.get_set_count()
if current_set_count == future_set_count-1:
self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {user_requested_partition_args}")
except amdsmi_exception.AmdSmiLibraryException as e:
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
@@ -5454,7 +5454,7 @@ class AMDSMICommands():
self.logger.table_header += 'MEM%'.rjust(7)
# don't populate mem clock on default output
# don't populate mem clock on default output
if not args.default_output:
try:
mem_clock = gpu_metrics_info['current_uclk']
@@ -5875,6 +5875,17 @@ class AMDSMICommands():
# Populate the possible gpus and their bdfs
xgmi_values = []
for gpu in args.gpu:
partition_id = -1
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(gpu)
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get kfd info for gpu %s | %s", gpu, e.get_error_info())
if partition_id != 0:
logging.debug(f"Skipping xgmi command due to non zero partition {gpu} - {partition_id}")
continue
logging.debug("check1 device_handle: %s", gpu)
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu)
@@ -5906,14 +5917,9 @@ class AMDSMICommands():
}
try:
pcie_static = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static']
if pcie_static['max_pcie_speed'] % 1000 != 0:
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000, 1)
else:
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000)
bitrate = pcie_speed_GTs_value
max_bandwidth = bitrate * pcie_static['max_pcie_width']
xgmi_metrics_info = amdsmi_interface.amdsmi_get_link_metrics(src_gpu)
bitrate = xgmi_metrics_info['bit_rate']
max_bandwidth = xgmi_metrics_info['max_bandwidth']
except amdsmi_exception.AmdSmiLibraryException as e:
bitrate = "N/A"
max_bandwidth = "N/A"
@@ -5935,7 +5941,18 @@ class AMDSMICommands():
xgmi_dict['link_metrics']['max_bandwidth'] = max_bandwidth
# Populate link metrics
link_num = 0
for dest_gpu in args.gpu:
partition_id = -1
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(dest_gpu)
partition_id = kfd_info['current_partition_id']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get kfd info for gpu %s | %s", dest_gpu, e.get_error_info())
if partition_id != 0:
continue
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
dest_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu)
dest_link_dict = {
@@ -5954,10 +5971,10 @@ class AMDSMICommands():
try:
# Get the read write relative to the source gpu
metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(src_gpu)
read = metrics_info['xgmi_read_data_acc'][dest_gpu_id]
write = metrics_info['xgmi_write_data_acc'][dest_gpu_id]
except amdsmi_exception.AmdSmiLibraryException as e:
read = xgmi_metrics_info['links'][link_num]['read']
write = xgmi_metrics_info['links'][link_num]['write']
link_num += 1
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
read = "N/A"
write = "N/A"
logging.debug("Failed to get read data for %s to %s | %s",
@@ -6087,7 +6104,21 @@ class AMDSMICommands():
self.logger.print_output(multiple_device_enabled=True, tabular=True)
self.logger.clear_multiple_devices_output()
if self.logger.is_human_readable_format():
print("\n* U:Up D:Down X:Disabled".ljust(13))
# Populate the legend output
legend_parts = [
"\n\nLegend:",
" SELF = Current GPU",
" N/A = Not supported",
" U / D / X = Link is Up / Down / Disabled",
" Read / Write = GPU Metric Accumulated Read / Write"
]
legend_output = "\n".join(legend_parts)
if self.logger.destination == 'stdout':
print(legend_output)
else:
with self.logger.destination.open('a', encoding="utf-8") as output_file:
output_file.write(legend_output + '\n')
def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None):
@@ -6385,7 +6416,7 @@ class AMDSMICommands():
continue
resource_index = 0
for p in range(0, num_profiles):
for p in range(0, num_profiles):
for r in range(0, num_resource_profiles):
resource_type = partition_config_dict['profiles'][p]['resources'][r]['resource_type']
resource_instances = partition_config_dict['profiles'][p]['resources'][r]['partition_resource']
+43
Просмотреть файл
@@ -3819,6 +3819,49 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_get_link_metrics
Description: Returns XGMI link metrics information for the given GPU.
Input parameters:
* `processor_handle` — The device handle for which to query link metrics.
Output: Dictionary with fields
Field | Description
---|---
`num_links` | Number of XGMI links reported
`bit_rate` | Current XGMI link speed, in Gb/s
`max_bandwidth` | Maximum XGMI link bandwidth, in Gb/s
`links` | List of dictionaries, one per XGMI link, each with:
`bdf` | BDF string for the destination
`link_type` | Link type
`read` | Accumulated read data for this link (e.g., KB)
`write` | Accumulated write data for this link (e.g., KB)
Exceptions that can be thrown by `amdsmi_get_link_metrics` function:
* `AmdSmiLibraryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
link_metrics = amdsmi_get_link_metrics(device)
print(link_metrics['bit_rate'])
print(link_metrics['max_bandwidth'])
for idx, link in enumerate(link_metrics['links']):
print(f"{idx}: {link['bdf']}, {link['link_type']}, {link['read']} KB, {link['write']} KB")
except AmdSmiException as e:
print(e)
```
### amdsmi_topo_get_link_type
Description: Retrieve the hops and the connection type between 2 GPUs
+7 -7
Просмотреть файл
@@ -967,14 +967,14 @@ typedef struct {
* @cond @tag{gpu_bm_linux} @endcond
*/
typedef struct {
uint32_t num_links; //!< number of links
uint32_t num_links; //!< number of links
uint32_t bit_rate; //!< current link speed in Gb/s
uint32_t max_bandwidth; //!< max bandwidth of the link in Gb/s
struct _links {
amdsmi_bdf_t bdf;
uint32_t bit_rate; //!< current link speed in Gb/s
uint32_t max_bandwidth; //!< max bandwidth of the link in Gb/s
amdsmi_link_type_t link_type; //!< type of the link
uint64_t read; //!< total data received for each link in KB
uint64_t write; //!< total data transfered for each link in KB
amdsmi_bdf_t bdf; //!< bdf of the destination gpu
amdsmi_link_type_t link_type; //!< type of the link
uint64_t read; //!< total data received for each link in KB
uint64_t write; //!< total data transfered for each link in KB
uint64_t reserved[2];
} links[AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK];
uint64_t reserved[7];
+1
Просмотреть файл
@@ -221,6 +221,7 @@ from .amdsmi_interface import amdsmi_get_gpu_subsystem_name
from .amdsmi_interface import amdsmi_topo_get_numa_node_number
from .amdsmi_interface import amdsmi_topo_get_link_weight
from .amdsmi_interface import amdsmi_get_minmax_bandwidth_between_processors
from .amdsmi_interface import amdsmi_get_link_metrics
from .amdsmi_interface import amdsmi_topo_get_link_type
from .amdsmi_interface import amdsmi_topo_get_p2p_status
from .amdsmi_interface import amdsmi_is_P2P_accessible
+35
Просмотреть файл
@@ -3099,6 +3099,41 @@ def amdsmi_get_minmax_bandwidth_between_processors(
return {"min_bandwidth": min_bandwidth.value, "max_bandwidth": max_bandwidth.value}
def amdsmi_get_link_metrics(processor_handle: amdsmi_wrapper.amdsmi_processor_handle):
    """Return the XGMI link metrics reported for ``processor_handle``.

    Parameters:
        processor_handle: Handle of the device to query; must be an
            ``amdsmi_wrapper.amdsmi_processor_handle`` instance.

    Returns:
        A dictionary with the keys ``num_links``, ``bit_rate``,
        ``max_bandwidth`` and ``links``. ``links`` is a list with one
        dictionary per link slot (``AMDSMI_MAX_NUM_XGMI_LINKS`` entries),
        each holding ``bdf``, ``link_type``, ``read`` and ``write``.

    Raises:
        AmdSmiParameterException: ``processor_handle`` is not a processor
            handle.
        Whatever ``_check_res`` raises when the library call reports a
            failure status (presumably ``AmdSmiLibraryException`` - confirm
            against ``_check_res``).
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    # Let the library fill the metrics structure in place.
    metrics = amdsmi_wrapper.amdsmi_link_metrics_t()
    status = amdsmi_wrapper.amdsmi_get_link_metrics(
        processor_handle, ctypes.byref(metrics)
    )
    _check_res(status)

    # TODO: Dummy BDF - to be replaced with destination BDF from xgmi_port_num when available
    placeholder_bdf = amdsmi_wrapper.amdsmi_bdf_t()
    placeholder_bdf.struct_amdsmi_bdf_t = amdsmi_wrapper.struct_amdsmi_bdf_t(
        0xFFFF, 0xFF, 0xFF, 0xF
    )

    # Every link slot is reported, each one tagged with the placeholder BDF.
    per_link = [
        {
            "bdf": _format_bdf(placeholder_bdf),
            "link_type": metrics.links[slot].link_type,
            "read": metrics.links[slot].read,
            "write": metrics.links[slot].write,
        }
        for slot in range(AMDSMI_MAX_NUM_XGMI_LINKS)
    ]

    result = {}
    result["num_links"] = AMDSMI_MAX_NUM_XGMI_LINKS
    result["bit_rate"] = metrics.bit_rate
    result["max_bandwidth"] = metrics.max_bandwidth
    result["links"] = per_link
    return result
def amdsmi_topo_get_link_type(
processor_handle_src: amdsmi_wrapper.amdsmi_processor_handle,
processor_handle_dst: amdsmi_wrapper.amdsmi_processor_handle,
+2 -2
Просмотреть файл
@@ -1153,8 +1153,6 @@ class struct__links(Structure):
struct__links._pack_ = 1 # source:False
struct__links._fields_ = [
('bdf', amdsmi_bdf_t),
('bit_rate', ctypes.c_uint32),
('max_bandwidth', ctypes.c_uint32),
('link_type', amdsmi_link_type_t),
('PADDING_0', ctypes.c_ubyte * 4),
('read', ctypes.c_uint64),
@@ -1165,6 +1163,8 @@ struct__links._fields_ = [
struct_amdsmi_link_metrics_t._pack_ = 1 # source:False
struct_amdsmi_link_metrics_t._fields_ = [
('num_links', ctypes.c_uint32),
('bit_rate', ctypes.c_uint32),
('max_bandwidth', ctypes.c_uint32),
('PADDING_0', ctypes.c_ubyte * 4),
('links', struct__links * 64),
('reserved', ctypes.c_uint64 * 7),
+9 -3
Просмотреть файл
@@ -2119,16 +2119,22 @@ amdsmi_status_t amdsmi_get_link_metrics(amdsmi_processor_handle processor_handle
if (link_metrics == nullptr) return AMDSMI_STATUS_INVAL;
amdsmi_gpu_metrics_t metric_info = {};
link_metrics->max_bandwidth = std::numeric_limits<uint32_t>::max();
amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
processor_handle, &metric_info);
if (status != AMDSMI_STATUS_SUCCESS)
return status;
link_metrics->num_links = AMDSMI_MAX_NUM_XGMI_LINKS;
for (unsigned int i = 0; i < link_metrics->num_links; i++) {
link_metrics->bit_rate = metric_info.xgmi_link_speed;
if ((metric_info.xgmi_link_speed != std::numeric_limits<uint16_t>::max()) &&
(metric_info.xgmi_link_width != std::numeric_limits<uint16_t>::max()))
link_metrics->max_bandwidth = metric_info.xgmi_link_speed * metric_info.xgmi_link_width;
for (unsigned int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; i++) {
link_metrics->links[i].read = metric_info.xgmi_read_data_acc[i];
link_metrics->links[i].write = metric_info.xgmi_write_data_acc[i];
link_metrics->links[i].bit_rate = metric_info.xgmi_link_speed;
link_metrics->links[i].max_bandwidth = metric_info.xgmi_link_width;
link_metrics->links[i].link_type = AMDSMI_LINK_TYPE_XGMI;
}