Adjust policy for memory display on APUs (#1967)

* Read the ids_flags when fetching GPU info

The ids_flags field contains flags that identify whether a GPU
is a dGPU or an APU.

* Show correct memory pool for APUs

The kernel policy for APUs will be to choose the bigger pool of
memory (GTT or VRAM) for KFD work.  Adjust the policy for the monitor
and default commands to show the right memory pool when using an APU.
Этот коммит содержится в:
Mario Limonciello
2025-12-09 21:49:06 -06:00
коммит произвёл GitHub
родитель 879d010974
Коммит 73778bf83c
8 изменённых файлов: 141 добавлений и 63 удалений
+54 -38
Просмотреть файл
@@ -6159,69 +6159,76 @@ class AMDSMICommands():
self.logger.table_header += 'PCIE_REPLAY'.rjust(13)
if args.vram_usage and not args.default_output:
mem_type, mem_type_name = self.helpers.get_apu_memory_type_and_name(args.gpu, gpu_id)
try:
vram_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
vram_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
monitor_values['vram_used'] = vram_used
monitor_values['vram_free'] = vram_total - vram_used
monitor_values['vram_total'] = vram_total
if vram_total != 0:
monitor_values['vram_percent'] = round ((vram_used / vram_total) * 100, 2)
mem_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, mem_type) // (1024*1024)
mem_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, mem_type) // (1024*1024)
monitor_values['vram_used'] = mem_used
monitor_values['vram_free'] = mem_total - mem_used
monitor_values['vram_total'] = mem_total
if mem_total != 0:
monitor_values['vram_percent'] = round ((mem_used / mem_total) * 100, 2)
else:
monitor_values['vram_percent'] = "N/A"
vram_usage_unit = "MB"
vram_percent_unit = "%"
mem_usage_unit = "MB"
mem_percent_unit = "%"
if self.logger.is_human_readable_format():
monitor_values['vram_used'] = f"{monitor_values['vram_used']} {vram_usage_unit}"
monitor_values['vram_free'] = f"{monitor_values['vram_free']} {vram_usage_unit}"
monitor_values['vram_total'] = f"{monitor_values['vram_total']} {vram_usage_unit}"
monitor_values['vram_percent'] = f"{monitor_values['vram_percent']} {vram_percent_unit}"
monitor_values['vram_used'] = f"{monitor_values['vram_used']} {mem_usage_unit}"
monitor_values['vram_free'] = f"{monitor_values['vram_free']} {mem_usage_unit}"
monitor_values['vram_total'] = f"{monitor_values['vram_total']} {mem_usage_unit}"
monitor_values['vram_percent'] = f"{monitor_values['vram_percent']} {mem_percent_unit}"
if self.logger.is_json_format():
monitor_values['vram_used'] = {"value" : monitor_values['vram_used'],
"unit" : vram_usage_unit}
"unit" : mem_usage_unit}
monitor_values['vram_free'] = {"value" : monitor_values['vram_free'],
"unit" : vram_usage_unit}
"unit" : mem_usage_unit}
monitor_values['vram_total'] = {"value" : monitor_values['vram_total'],
"unit" : vram_usage_unit}
"unit" : mem_usage_unit}
monitor_values['vram_percent'] = {"value" : monitor_values['vram_percent'],
"unit" : vram_percent_unit}
"unit" : mem_percent_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['vram_used'] = "N/A"
monitor_values['vram_free'] = "N/A"
monitor_values['vram_total'] = "N/A"
monitor_values['vram_percent'] = "N/A"
logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info())
logging.debug("Failed to get %s memory usage on gpu %s | %s", mem_type_name.lower(), gpu_id, e.get_error_info())
self.logger.table_header += 'VRAM_USED'.rjust(11)
self.logger.table_header += 'VRAM_FREE'.rjust(12)
self.logger.table_header += 'VRAM_TOTAL'.rjust(12)
self.logger.table_header += 'VRAM%'.rjust(9)
# Use appropriate headers based on memory type
self.logger.table_header += f'{mem_type_name}_USED'.rjust(11)
self.logger.table_header += f'{mem_type_name}_FREE'.rjust(12)
self.logger.table_header += f'{mem_type_name}_TOTAL'.rjust(12)
self.logger.table_header += f'{mem_type_name}%'.rjust(9)
if args.vram_usage and args.default_output:
mem_type, mem_type_name = self.helpers.get_apu_memory_type_and_name(args.gpu, gpu_id)
try:
vram_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
vram_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
vram_usage_unit = "GB"
mem_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, mem_type) // (1024*1024)
mem_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, mem_type) // (1024*1024)
mem_usage_unit = "GB"
if self.logger.is_json_format():
monitor_values['vram_used'] = {"value" : round(vram_used/1024,1),
"unit" : vram_usage_unit}
monitor_values['vram_total'] = {"value" : round(vram_total/1024,1),
"unit" : vram_usage_unit}
monitor_values['vram_used'] = {"value" : round(mem_used/1024,1),
"unit" : mem_usage_unit}
monitor_values['vram_total'] = {"value" : round(mem_total/1024,1),
"unit" : mem_usage_unit}
elif self.logger.is_csv_format():
monitor_values['vram_used'] = round(vram_used/1024,1)
monitor_values['vram_total'] = round(vram_total/1024,1)
monitor_values['vram_used'] = round(mem_used/1024,1)
monitor_values['vram_total'] = round(mem_total/1024,1)
else:
monitor_values['vram_usage'] = f"{vram_used/1024:5.1f}/{vram_total/1024:5.1f} {vram_usage_unit}".rjust(16,' ')
monitor_values['vram_usage'] = f"{mem_used/1024:5.1f}/{mem_total/1024:5.1f} {mem_usage_unit}".rjust(16,' ')
except amdsmi_exception.AmdSmiLibraryException as e:
if self.logger.is_json_format():
monitor_values['vram_used'] = "N/A"
monitor_values['vram_total'] = "N/A"
else:
monitor_values['vram_usage'] = "N/A"
logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info())
logging.debug("Failed to get %s memory usage on gpu %s | %s", mem_type_name.lower(), gpu_id, e.get_error_info())
self.logger.table_header += 'VRAM_USAGE'.rjust(16)
# Use appropriate header based on memory type
header_name = f'{mem_type_name}_USAGE'
self.logger.table_header += header_name.rjust(16)
if args.pcie:
if pcie_info != "N/A":
@@ -7518,11 +7525,20 @@ class AMDSMICommands():
power_usage = "N/A"
gpu_info_dict.update({"power_usage": power_usage})
# memory usage
# memory usage - Use APU-aware memory selection
try:
total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
used_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
mem_usage = {"used_vram": used_vram, "total_vram": total_vram}
# Use helper method to determine appropriate memory type
mem_type, mem_type_name = self.helpers.get_apu_memory_type_and_name(processor, gpu_id)
# Get memory usage and total using the determined memory type
used_mem = amdsmi_interface.amdsmi_get_gpu_memory_usage(processor, mem_type) // (1024*1024)
total_mem = amdsmi_interface.amdsmi_get_gpu_memory_total(processor, mem_type) // (1024*1024)
# Create appropriate dictionary keys based on memory type
if mem_type_name == "GTT":
mem_usage = {"used_gtt": used_mem, "total_gtt": total_mem}
else:
mem_usage = {"used_vram": used_mem, "total_vram": total_mem}
except amdsmi_exception.AmdSmiLibraryException as e:
mem_usage = "N/A"
gpu_info_dict.update({"mem_usage": mem_usage})
+65 -20
Просмотреть файл
@@ -762,6 +762,51 @@ class AMDSMIHelpers():
return gpu_bdfs
def get_apu_memory_type_and_name(self, device_handle, gpu_id=None):
    """Select the memory pool (VRAM or GTT) to report for a device.

    The ASIC info flags are read to decide whether the device is an APU.
    For an APU, the VRAM and GTT totals are compared and GTT is chosen only
    when it is strictly larger. In every other case, including any
    AmdSmiLibraryException raised while querying the device, VRAM is
    returned.

    Args:
        device_handle: GPU device handle
        gpu_id: Optional GPU ID, used only in debug log messages. When
            omitted it is looked up from the device handle; if that lookup
            fails, "unknown" is logged instead.

    Returns:
        tuple: (memory_type, memory_type_name) where memory_type is an
            AmdSmiMemoryType enum member and memory_type_name is the
            string "VRAM" or "GTT"
    """
    # AMDGPU_IDS_FLAGS_FUSION: bit set in the ASIC flags when the device is an APU
    ids_flags_fusion = 0x1
    bytes_per_mb = 1024 * 1024

    # Default to VRAM
    mem_type = amdsmi_interface.AmdSmiMemoryType.VRAM
    mem_type_name = "VRAM"

    if gpu_id is None:
        # The id only feeds log messages, so an ordinary lookup failure is
        # tolerated. Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        try:
            gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        except Exception:
            gpu_id = "unknown"

    try:
        # Check ASIC info flags to see if it's an APU
        asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(device_handle)
        if 'flags' in asic_info and (asic_info['flags'] & ids_flags_fusion):
            # For APUs, compare VRAM and GTT totals and use the larger one
            try:
                vram_total_check = amdsmi_interface.amdsmi_get_gpu_memory_total(
                    device_handle, amdsmi_interface.AmdSmiMemoryType.VRAM) // bytes_per_mb
                gtt_total_check = amdsmi_interface.amdsmi_get_gpu_memory_total(
                    device_handle, amdsmi_interface.AmdSmiMemoryType.GTT) // bytes_per_mb

                # Ties keep the VRAM default; only a strictly larger GTT pool wins
                if gtt_total_check > vram_total_check:
                    mem_type = amdsmi_interface.AmdSmiMemoryType.GTT
                    mem_type_name = "GTT"

                logging.debug("APU detected for gpu %s, using %s (VRAM: %d MB, GTT: %d MB)", gpu_id, mem_type_name, vram_total_check, gtt_total_check)
            except amdsmi_exception.AmdSmiLibraryException as e:
                logging.debug("Failed to compare memory types for APU gpu %s, defaulting to VRAM | %s", gpu_id, e.get_error_info())
    except amdsmi_exception.AmdSmiLibraryException as e:
        logging.debug("Failed to get ASIC info for gpu %s, defaulting to VRAM | %s", gpu_id, e.get_error_info())

    return mem_type, mem_type_name
def is_amd_device(self, device_handle):
""" Return whether the specified device is an AMD device or not
@@ -1215,17 +1260,17 @@ class AMDSMIHelpers():
# Resolve a numeric group id to its group name; results are memoized by lru_cache.
@lru_cache(maxsize=128)
def _cached_group_name(self, gid: int) -> str:
try:
try:
return grp.getgrgid(gid).gr_name
except Exception:
except Exception:
# In containers, the GID may not resolve to a name; fall back to the numeric id as text
return str(gid)
# Resolve a numeric user id to its user name; results are memoized by lru_cache.
@lru_cache(maxsize=128)
def _cached_user_name(self, uid: int) -> str:
try:
try:
return pwd.getpwuid(uid).pw_name
except Exception:
except Exception:
# In containers, the UID may not resolve to a name; fall back to the numeric id as text
return str(uid)
@@ -1286,11 +1331,11 @@ class AMDSMIHelpers():
"""
Check if the current user can access kfd and dri
Specifically, only care for EACCES/EPERM
Args:
check_render (bool): Whether to check /dev/kfd & /dev/dri/renderD* devices. Defaults to True.
check_video (bool): Whether to check /dev/dri/card* devices. Defaults to True.
Returns:
bool: True if all checked devices are accessible, False if any permission errors found
"""
@@ -1300,7 +1345,7 @@ class AMDSMIHelpers():
return True
paths_to_check = []
# Only add paths for device types that are flagged for checking
if check_render and os.path.exists("/dev/kfd"):
paths_to_check.append("/dev/kfd")
@@ -1319,7 +1364,7 @@ class AMDSMIHelpers():
# Do not try to open all paths, may cause driver issues.
# Read access is sufficient to check permissions.
#
# Reason: GPUs which support partitioning (memory/compute),
# Reason: GPUs which support partitioning (memory/compute),
# logical devices will not be valid until configured.
# See `sudo amd-smi set -h` or applicable APIs
# to configure on supported hardware.
@@ -1565,14 +1610,14 @@ class AMDSMIHelpers():
error_severity = entry.get("error_severity", "").lower()
notify_type = entry.get("notify_type", "")
prefix = self._severity_as_string(error_severity, notify_type, True)
# Generate filenames
count = self.get_cper_count() + 1
cper_name = f"{prefix}-{count}.cper"
json_name = f"{prefix}-{count}.json"
cper_path = folder / cper_name
json_path = folder / json_name
# Write CPER binary file
try:
self.write_binary(
@@ -1582,7 +1627,7 @@ class AMDSMIHelpers():
)
except Exception as e:
logging.debug(f"Failed to write CPER file {cper_path}: {e}")
# Write JSON metadata file
try:
with json_path.open("w") as cper_json_file:
@@ -1594,7 +1639,7 @@ class AMDSMIHelpers():
)
except Exception as e:
logging.debug(f"Failed to write JSON file {json_path}: {e}")
# Collect data for printing
timestamp = entry.get("timestamp", "unknown")
gpu_id = self.get_gpu_id_from_device_handle(device_handle)
@@ -1980,13 +2025,13 @@ class AMDSMIHelpers():
"""
Helper method to compute metric version, partition ID, and num_partition for dynamic metrics.
Handles logging updates internally for reusability.
Args:
gpu_metrics_info (dict): GPU metrics info from amdsmi_get_gpu_metrics_info.
is_partition_metrics (bool): Whether this is for partition metrics.
gpu_id (int): GPU ID for logging.
gpu_handle: GPU device handle for KFD info retrieval.
Returns:
dict: {
'metric_version': float or "N/A",
@@ -2004,7 +2049,7 @@ class AMDSMIHelpers():
metric_version = float(f"{format_rev}.{content_rev}")
except ValueError:
metric_version = "N/A" # Fallback if conversion fails
# Retrieve partition ID from KFD info
partition_id = "N/A"
try:
@@ -2012,7 +2057,7 @@ class AMDSMIHelpers():
partition_id = kfd_info.get('current_partition_id', "N/A")
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get current partition ID for GPU %s | %s", gpu_id, e.get_error_info())
# Determine num_partition with fallback logic for dynamic metrics
num_partition = gpu_metrics_info.get('num_partition', "N/A")
if metric_version != "N/A" and num_partition == "N/A":
@@ -2026,22 +2071,22 @@ class AMDSMIHelpers():
# Fallback to partition_id if partitions exist but num_partition is unavailable
num_partition = partition_id
# Else: Remains "N/A" if no conditions match
# Alias num_xcp for XCP metrics usage
num_xcp = num_partition
# Debug logging
logging.debug(
"GPU %s | Metric version: %s, num_partition: %s, partition_id: %s, num_xcp: %s",
gpu_id, metric_version, num_partition, partition_id, num_xcp
)
return {
'metric_version': metric_version,
'partition_id': partition_id,
'num_partition': num_partition,
'num_xcp': num_xcp
}
}
def get_gpu_board_temperatures(self, device_handle, gpu_id, logger):
"""Get GPU board temperature readings
+10 -1
Просмотреть файл
@@ -1105,7 +1105,16 @@ class AMDSMILogger():
mem_usage = gpu_info['mem_usage']
if mem_usage != "N/A":
mem_usage = f"{gpu_info['mem_usage']['used_vram']}/{gpu_info['mem_usage']['total_vram']} MB"
# Support both VRAM and GTT memory types for APU-aware display
if 'used_gtt' in mem_usage and 'total_gtt' in mem_usage:
# GTT memory selected (likely APU)
mem_usage = f"{gpu_info['mem_usage']['used_gtt']}/{gpu_info['mem_usage']['total_gtt']} MB"
elif 'used_vram' in mem_usage and 'total_vram' in mem_usage:
# VRAM memory selected (standard or APU with more VRAM)
mem_usage = f"{gpu_info['mem_usage']['used_vram']}/{gpu_info['mem_usage']['total_vram']} MB"
else:
# Fallback if neither format is found
mem_usage = "N/A"
mem_usage = mem_usage.rjust(21)
print("| {0:12.12s} {1:22.22s} | {2:5.5s} {3:6.6s} {4:5.5s} {5:13.13s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage))
+2 -1
Просмотреть файл
@@ -927,7 +927,8 @@ typedef struct {
uint32_t num_of_compute_units; //!< 0xFFFFFFFF if not supported
uint64_t target_graphics_version; //!< 0xFFFFFFFFFFFFFFFF if not supported
uint32_t subsystem_id; //!> The subsystem ID
uint32_t reserved[21];
uint64_t flags; //!< Chip flags
uint32_t reserved[19];
} amdsmi_asic_info_t;
/**
+2 -1
Просмотреть файл
@@ -2123,7 +2123,8 @@ def amdsmi_get_gpu_asic_info(
"oam_id": _validate_if_max_uint(asic_info_struct.oam_id, MaxUIntegerTypes.UINT32_T),
"num_compute_units": _validate_if_max_uint(asic_info_struct.num_of_compute_units, MaxUIntegerTypes.UINT32_T),
"target_graphics_version": "gfx" + target_graphics_version,
"subsystem_id": subsystem_id
"subsystem_id": subsystem_id,
"flags": asic_info_struct.flags
}
string_values = ["market_name", "vendor_name"]
+4 -1
Просмотреть файл
@@ -1137,7 +1137,10 @@ struct_amdsmi_asic_info_t._fields_ = [
('PADDING_0', ctypes.c_ubyte * 4),
('target_graphics_version', ctypes.c_uint64),
('subsystem_id', ctypes.c_uint32),
('reserved', ctypes.c_uint32 * 21),
('PADDING_1', ctypes.c_ubyte * 4),
('flags', ctypes.c_uint64),
('reserved', ctypes.c_uint32 * 19),
('PADDING_2', ctypes.c_ubyte * 4),
]
amdsmi_asic_info_t = struct_amdsmi_asic_info_t
+2 -1
Просмотреть файл
@@ -1156,7 +1156,8 @@ pub struct AmdsmiAsicInfoT {
pub num_of_compute_units: u32,
pub target_graphics_version: u64,
pub subsystem_id: u32,
pub reserved: [u32; 21usize],
pub flags: u64,
pub reserved: [u32; 19usize],
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
+2
Просмотреть файл
@@ -1716,6 +1716,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
info->num_of_compute_units = std::numeric_limits<uint32_t>::max();
info->target_graphics_version = std::numeric_limits<uint64_t>::max();
info->subsystem_id = std::numeric_limits<uint32_t>::max();
info->flags = 0;
std::ostringstream ss;
amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
@@ -1921,6 +1922,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
}
// TODO(cpoag): check if this is correct, might be able to go through KGD/KFD
info->rev_id = static_cast<uint32_t>(dev_info.pci_rev);
info->flags = static_cast<uint64_t>(dev_info.ids_flags);
libdrm.unload();
ss << __PRETTY_FUNCTION__