[SWDEV-530633] Use gpu_metric speed and BW for xgmi (#366)
The xgmi command was showing the PCIe bit rate and bandwidth instead of the XGMI values. Corrected the API to get the XGMI data from the GPU metrics. Added a Python API for amdsmi_get_link_metrics. Modified the amdsmi_link_metrics_t struct. Added a check so that the xgmi command skips GPUs whose current partition ID is non-zero. --------- Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com> Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
2e8aaf02c9
Коммит
2eff0b3764
@@ -40,7 +40,7 @@ class AMDSMICommands():
|
||||
Each command function will interact with AMDSMILogger to handle
|
||||
displaying the output to the specified format and destination.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, format='human_readable', destination='stdout') -> None:
|
||||
self.helpers = AMDSMIHelpers()
|
||||
self.logger = AMDSMILogger(format=format, destination=destination)
|
||||
@@ -2024,7 +2024,7 @@ class AMDSMICommands():
|
||||
except KeyError as e:
|
||||
logging.debug("Failed to get current_socclk for gpu %s | %s", gpu_id, e)
|
||||
|
||||
|
||||
|
||||
# Populate the max and min clock values from sysfs.
|
||||
# Min and Max values are per clock type, not per clock engine.
|
||||
# Populate the deep sleep value from amdsmi_get_clock_info
|
||||
@@ -2075,7 +2075,7 @@ class AMDSMICommands():
|
||||
# Iterate through the maximum number of VCLK clocks supported
|
||||
for index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS):
|
||||
vclk_index = f"vclk_{index}" # Construct the index key for the clock
|
||||
|
||||
|
||||
# Check if the current clock value is not "N/A"
|
||||
if clocks[vclk_index]["clk"] != "N/A":
|
||||
# Format and assign the minimum clock value for the current VCLK
|
||||
@@ -4480,7 +4480,7 @@ class AMDSMICommands():
|
||||
future_set_count = self.helpers.get_set_count()
|
||||
if current_set_count == future_set_count-1:
|
||||
self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {user_requested_partition_args}")
|
||||
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
@@ -5454,7 +5454,7 @@ class AMDSMICommands():
|
||||
|
||||
self.logger.table_header += 'MEM%'.rjust(7)
|
||||
|
||||
# don't populate mem clock on default output
|
||||
# don't populate mem clock on default output
|
||||
if not args.default_output:
|
||||
try:
|
||||
mem_clock = gpu_metrics_info['current_uclk']
|
||||
@@ -5875,6 +5875,17 @@ class AMDSMICommands():
|
||||
# Populate the possible gpus and their bdfs
|
||||
xgmi_values = []
|
||||
for gpu in args.gpu:
|
||||
partition_id = -1
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(gpu)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", gpu, e.get_error_info())
|
||||
|
||||
if partition_id != 0:
|
||||
logging.debug(f"Skipping xgmi command due to non zero partition {gpu} - {partition_id}")
|
||||
continue
|
||||
|
||||
logging.debug("check1 device_handle: %s", gpu)
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
|
||||
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu)
|
||||
@@ -5906,14 +5917,9 @@ class AMDSMICommands():
|
||||
}
|
||||
|
||||
try:
|
||||
pcie_static = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static']
|
||||
if pcie_static['max_pcie_speed'] % 1000 != 0:
|
||||
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000, 1)
|
||||
else:
|
||||
pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000)
|
||||
|
||||
bitrate = pcie_speed_GTs_value
|
||||
max_bandwidth = bitrate * pcie_static['max_pcie_width']
|
||||
xgmi_metrics_info = amdsmi_interface.amdsmi_get_link_metrics(src_gpu)
|
||||
bitrate = xgmi_metrics_info['bit_rate']
|
||||
max_bandwidth = xgmi_metrics_info['max_bandwidth']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
bitrate = "N/A"
|
||||
max_bandwidth = "N/A"
|
||||
@@ -5935,7 +5941,18 @@ class AMDSMICommands():
|
||||
xgmi_dict['link_metrics']['max_bandwidth'] = max_bandwidth
|
||||
|
||||
# Populate link metrics
|
||||
link_num = 0
|
||||
for dest_gpu in args.gpu:
|
||||
partition_id = -1
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(dest_gpu)
|
||||
partition_id = kfd_info['current_partition_id']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get kfd info for gpu %s | %s", dest_gpu, e.get_error_info())
|
||||
|
||||
if partition_id != 0:
|
||||
continue
|
||||
|
||||
dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu)
|
||||
dest_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu)
|
||||
dest_link_dict = {
|
||||
@@ -5954,10 +5971,10 @@ class AMDSMICommands():
|
||||
|
||||
try:
|
||||
# Get the read write relative to the source gpu
|
||||
metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(src_gpu)
|
||||
read = metrics_info['xgmi_read_data_acc'][dest_gpu_id]
|
||||
write = metrics_info['xgmi_write_data_acc'][dest_gpu_id]
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
read = xgmi_metrics_info['links'][link_num]['read']
|
||||
write = xgmi_metrics_info['links'][link_num]['write']
|
||||
link_num += 1
|
||||
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
|
||||
read = "N/A"
|
||||
write = "N/A"
|
||||
logging.debug("Failed to get read data for %s to %s | %s",
|
||||
@@ -6087,7 +6104,21 @@ class AMDSMICommands():
|
||||
self.logger.print_output(multiple_device_enabled=True, tabular=True)
|
||||
self.logger.clear_multiple_devices_output()
|
||||
if self.logger.is_human_readable_format():
|
||||
print("\n* U:Up D:Down X:Disabled".ljust(13))
|
||||
# Populate the legend output
|
||||
legend_parts = [
|
||||
"\n\nLegend:",
|
||||
" SELF = Current GPU",
|
||||
" N/A = Not supported",
|
||||
" U / D / X = Link is Up / Down / Disabled",
|
||||
" Read / Write = GPU Metric Accumulated Read / Write"
|
||||
]
|
||||
legend_output = "\n".join(legend_parts)
|
||||
|
||||
if self.logger.destination == 'stdout':
|
||||
print(legend_output)
|
||||
else:
|
||||
with self.logger.destination.open('a', encoding="utf-8") as output_file:
|
||||
output_file.write(legend_output + '\n')
|
||||
|
||||
|
||||
def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None):
|
||||
@@ -6385,7 +6416,7 @@ class AMDSMICommands():
|
||||
continue
|
||||
|
||||
resource_index = 0
|
||||
for p in range(0, num_profiles):
|
||||
for p in range(0, num_profiles):
|
||||
for r in range(0, num_resource_profiles):
|
||||
resource_type = partition_config_dict['profiles'][p]['resources'][r]['resource_type']
|
||||
resource_instances = partition_config_dict['profiles'][p]['resources'][r]['partition_resource']
|
||||
|
||||
@@ -3819,6 +3819,49 @@ except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_link_metrics
|
||||
|
||||
Description: Returns XGMI link metrics information for the given GPU.
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` — The device handle for which to query link metrics.
|
||||
|
||||
Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`num_links` | Number of XGMI links reported
|
||||
`bit_rate` | Current XGMI link speed, in Gb/s
|
||||
`max_bandwidth` | Maximum XGMI link bandwidth, in Gb/s
|
||||
`links` | List of dictionaries, one per XGMI link, each with:
|
||||
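`bdf` | BDF string for the destination GPU (the Python binding currently returns a placeholder BDF until the real destination BDF is available)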
|
||||
`link_type` | Link type
|
||||
`read` | Accumulated data received on this link, in KB
|
||||
`write` | Accumulated data transferred on this link, in KB
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_link_metrics` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
devices = amdsmi_get_processor_handles()
|
||||
if len(devices) == 0:
|
||||
print("No GPUs on machine")
|
||||
else:
|
||||
for device in devices:
|
||||
link_metrics = amdsmi_get_link_metrics(device)
|
||||
print(link_metrics['bit_rate'])
|
||||
print(link_metrics['max_bandwidth'])
|
||||
for idx, link in enumerate(link_metrics['links']):
|
||||
print(f"{idx}: {link['bdf']}, {link['link_type']}, {link['read']} KB, {link['write']} KB")
|
||||
except AmdSmiException as e:
|
||||
print(e)
```
|
||||
|
||||
### amdsmi_topo_get_link_type
|
||||
|
||||
Description: Retrieve the hops and the connection type between 2 GPUs
|
||||
|
||||
@@ -967,14 +967,14 @@ typedef struct {
|
||||
* @cond @tag{gpu_bm_linux} @endcond
|
||||
*/
|
||||
typedef struct {
|
||||
uint32_t num_links; //!< number of links
|
||||
uint32_t num_links; //!< number of links
|
||||
uint32_t bit_rate; //!< current link speed in Gb/s
|
||||
uint32_t max_bandwidth; //!< max bandwidth of the link in Gb/s
|
||||
struct _links {
|
||||
amdsmi_bdf_t bdf;
|
||||
uint32_t bit_rate; //!< current link speed in Gb/s
|
||||
uint32_t max_bandwidth; //!< max bandwidth of the link in Gb/s
|
||||
amdsmi_link_type_t link_type; //!< type of the link
|
||||
uint64_t read; //!< total data received for each link in KB
|
||||
uint64_t write; //!< total data transfered for each link in KB
|
||||
amdsmi_bdf_t bdf; //!< bdf of the destination gpu
|
||||
amdsmi_link_type_t link_type; //!< type of the link
|
||||
uint64_t read; //!< total data received for each link in KB
|
||||
uint64_t write; //!< total data transfered for each link in KB
|
||||
uint64_t reserved[2];
|
||||
} links[AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK];
|
||||
uint64_t reserved[7];
|
||||
|
||||
@@ -221,6 +221,7 @@ from .amdsmi_interface import amdsmi_get_gpu_subsystem_name
|
||||
from .amdsmi_interface import amdsmi_topo_get_numa_node_number
|
||||
from .amdsmi_interface import amdsmi_topo_get_link_weight
|
||||
from .amdsmi_interface import amdsmi_get_minmax_bandwidth_between_processors
|
||||
from .amdsmi_interface import amdsmi_get_link_metrics
|
||||
from .amdsmi_interface import amdsmi_topo_get_link_type
|
||||
from .amdsmi_interface import amdsmi_topo_get_p2p_status
|
||||
from .amdsmi_interface import amdsmi_is_P2P_accessible
|
||||
|
||||
@@ -3099,6 +3099,41 @@ def amdsmi_get_minmax_bandwidth_between_processors(
|
||||
return {"min_bandwidth": min_bandwidth.value, "max_bandwidth": max_bandwidth.value}
|
||||
|
||||
|
||||
def amdsmi_get_link_metrics(processor_handle: amdsmi_wrapper.amdsmi_processor_handle):
    """Fetch the link metrics of one processor and return them as a dict.

    Parameters:
        processor_handle: handle of the processor to query; must be an
            ``amdsmi_wrapper.amdsmi_processor_handle`` instance.

    Returns:
        A dictionary with the keys:
          * ``num_links``     - the ``AMDSMI_MAX_NUM_XGMI_LINKS`` constant (not
                                read back from the library structure).
          * ``bit_rate``      - ``bit_rate`` field of the library structure.
          * ``max_bandwidth`` - ``max_bandwidth`` field of the library structure.
          * ``links``         - one dict per link index with ``bdf``,
                                ``link_type``, ``read`` and ``write``.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` has the wrong type.
        The library status is handed to ``_check_res``, which presumably raises
        on a non-success status (defined elsewhere -- confirm there).
    """
    handle_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, handle_type):
        raise AmdSmiParameterException(processor_handle, handle_type)

    # Let the library fill a freshly allocated metrics structure.
    raw_metrics = amdsmi_wrapper.amdsmi_link_metrics_t()
    status = amdsmi_wrapper.amdsmi_get_link_metrics(
        processor_handle, ctypes.byref(raw_metrics)
    )
    _check_res(status)

    # TODO: Dummy BDF - to be replaced with destination BDF from xgmi_port_num when available
    placeholder_bdf = amdsmi_wrapper.amdsmi_bdf_t()
    placeholder_bdf.struct_amdsmi_bdf_t = amdsmi_wrapper.struct_amdsmi_bdf_t(0xFFFF, 0xFF, 0xFF, 0xF)

    # Every link entry reports the same placeholder BDF; only the type and the
    # read/write values come from the per-link data.
    per_link = [
        {
            "bdf": _format_bdf(placeholder_bdf),
            "link_type": entry.link_type,
            "read": entry.read,
            "write": entry.write,
        }
        for entry in (
            raw_metrics.links[idx] for idx in range(AMDSMI_MAX_NUM_XGMI_LINKS)
        )
    ]

    result = {
        "num_links": AMDSMI_MAX_NUM_XGMI_LINKS,
        "bit_rate": raw_metrics.bit_rate,
        "max_bandwidth": raw_metrics.max_bandwidth,
        "links": per_link
    }
    return result
|
||||
|
||||
|
||||
def amdsmi_topo_get_link_type(
|
||||
processor_handle_src: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
processor_handle_dst: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
|
||||
@@ -1153,8 +1153,6 @@ class struct__links(Structure):
|
||||
struct__links._pack_ = 1 # source:False
|
||||
struct__links._fields_ = [
|
||||
('bdf', amdsmi_bdf_t),
|
||||
('bit_rate', ctypes.c_uint32),
|
||||
('max_bandwidth', ctypes.c_uint32),
|
||||
('link_type', amdsmi_link_type_t),
|
||||
('PADDING_0', ctypes.c_ubyte * 4),
|
||||
('read', ctypes.c_uint64),
|
||||
@@ -1165,6 +1163,8 @@ struct__links._fields_ = [
|
||||
struct_amdsmi_link_metrics_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_link_metrics_t._fields_ = [
|
||||
('num_links', ctypes.c_uint32),
|
||||
('bit_rate', ctypes.c_uint32),
|
||||
('max_bandwidth', ctypes.c_uint32),
|
||||
('PADDING_0', ctypes.c_ubyte * 4),
|
||||
('links', struct__links * 64),
|
||||
('reserved', ctypes.c_uint64 * 7),
|
||||
|
||||
@@ -2119,16 +2119,22 @@ amdsmi_status_t amdsmi_get_link_metrics(amdsmi_processor_handle processor_handle
|
||||
if (link_metrics == nullptr) return AMDSMI_STATUS_INVAL;
|
||||
|
||||
amdsmi_gpu_metrics_t metric_info = {};
|
||||
link_metrics->max_bandwidth = std::numeric_limits<uint32_t>::max();
|
||||
|
||||
amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
|
||||
processor_handle, &metric_info);
|
||||
if (status != AMDSMI_STATUS_SUCCESS)
|
||||
return status;
|
||||
link_metrics->num_links = AMDSMI_MAX_NUM_XGMI_LINKS;
|
||||
for (unsigned int i = 0; i < link_metrics->num_links; i++) {
|
||||
|
||||
link_metrics->bit_rate = metric_info.xgmi_link_speed;
|
||||
if ((metric_info.xgmi_link_speed != std::numeric_limits<uint16_t>::max()) &&
|
||||
(metric_info.xgmi_link_width != std::numeric_limits<uint16_t>::max()))
|
||||
link_metrics->max_bandwidth = metric_info.xgmi_link_speed * metric_info.xgmi_link_width;
|
||||
|
||||
for (unsigned int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; i++) {
|
||||
link_metrics->links[i].read = metric_info.xgmi_read_data_acc[i];
|
||||
link_metrics->links[i].write = metric_info.xgmi_write_data_acc[i];
|
||||
link_metrics->links[i].bit_rate = metric_info.xgmi_link_speed;
|
||||
link_metrics->links[i].max_bandwidth = metric_info.xgmi_link_width;
|
||||
link_metrics->links[i].link_type = AMDSMI_LINK_TYPE_XGMI;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user