diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 1e13f524d8..302c4a20fc 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6159,69 +6159,76 @@ class AMDSMICommands(): self.logger.table_header += 'PCIE_REPLAY'.rjust(13) if args.vram_usage and not args.default_output: + mem_type, mem_type_name = self.helpers.get_apu_memory_type_and_name(args.gpu, gpu_id) + try: - vram_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) - vram_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) - monitor_values['vram_used'] = vram_used - monitor_values['vram_free'] = vram_total - vram_used - monitor_values['vram_total'] = vram_total - if vram_total != 0: - monitor_values['vram_percent'] = round ((vram_used / vram_total) * 100, 2) + mem_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, mem_type) // (1024*1024) + mem_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, mem_type) // (1024*1024) + monitor_values['vram_used'] = mem_used + monitor_values['vram_free'] = mem_total - mem_used + monitor_values['vram_total'] = mem_total + if mem_total != 0: + monitor_values['vram_percent'] = round ((mem_used / mem_total) * 100, 2) else: monitor_values['vram_percent'] = "N/A" - vram_usage_unit = "MB" - vram_percent_unit = "%" + mem_usage_unit = "MB" + mem_percent_unit = "%" if self.logger.is_human_readable_format(): - monitor_values['vram_used'] = f"{monitor_values['vram_used']} {vram_usage_unit}" - monitor_values['vram_free'] = f"{monitor_values['vram_free']} {vram_usage_unit}" - monitor_values['vram_total'] = f"{monitor_values['vram_total']} {vram_usage_unit}" - monitor_values['vram_percent'] = f"{monitor_values['vram_percent']} {vram_percent_unit}" + monitor_values['vram_used'] = f"{monitor_values['vram_used']} 
{mem_usage_unit}" + monitor_values['vram_free'] = f"{monitor_values['vram_free']} {mem_usage_unit}" + monitor_values['vram_total'] = f"{monitor_values['vram_total']} {mem_usage_unit}" + monitor_values['vram_percent'] = f"{monitor_values['vram_percent']} {mem_percent_unit}" if self.logger.is_json_format(): monitor_values['vram_used'] = {"value" : monitor_values['vram_used'], - "unit" : vram_usage_unit} + "unit" : mem_usage_unit} monitor_values['vram_free'] = {"value" : monitor_values['vram_free'], - "unit" : vram_usage_unit} + "unit" : mem_usage_unit} monitor_values['vram_total'] = {"value" : monitor_values['vram_total'], - "unit" : vram_usage_unit} + "unit" : mem_usage_unit} monitor_values['vram_percent'] = {"value" : monitor_values['vram_percent'], - "unit" : vram_percent_unit} + "unit" : mem_percent_unit} except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['vram_used'] = "N/A" monitor_values['vram_free'] = "N/A" monitor_values['vram_total'] = "N/A" monitor_values['vram_percent'] = "N/A" - logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get %s memory usage on gpu %s | %s", mem_type_name.lower(), gpu_id, e.get_error_info()) - self.logger.table_header += 'VRAM_USED'.rjust(11) - self.logger.table_header += 'VRAM_FREE'.rjust(12) - self.logger.table_header += 'VRAM_TOTAL'.rjust(12) - self.logger.table_header += 'VRAM%'.rjust(9) + # Use appropriate headers based on memory type + self.logger.table_header += f'{mem_type_name}_USED'.rjust(11) + self.logger.table_header += f'{mem_type_name}_FREE'.rjust(12) + self.logger.table_header += f'{mem_type_name}_TOTAL'.rjust(12) + self.logger.table_header += f'{mem_type_name}%'.rjust(9) if args.vram_usage and args.default_output: + mem_type, mem_type_name = self.helpers.get_apu_memory_type_and_name(args.gpu, gpu_id) + try: - vram_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) - 
vram_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024) - vram_usage_unit = "GB" + mem_used = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, mem_type) // (1024*1024) + mem_total = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, mem_type) // (1024*1024) + mem_usage_unit = "GB" if self.logger.is_json_format(): - monitor_values['vram_used'] = {"value" : round(vram_used/1024,1), - "unit" : vram_usage_unit} - monitor_values['vram_total'] = {"value" : round(vram_total/1024,1), - "unit" : vram_usage_unit} + monitor_values['vram_used'] = {"value" : round(mem_used/1024,1), + "unit" : mem_usage_unit} + monitor_values['vram_total'] = {"value" : round(mem_total/1024,1), + "unit" : mem_usage_unit} elif self.logger.is_csv_format(): - monitor_values['vram_used'] = round(vram_used/1024,1) - monitor_values['vram_total'] = round(vram_total/1024,1) + monitor_values['vram_used'] = round(mem_used/1024,1) + monitor_values['vram_total'] = round(mem_total/1024,1) else: - monitor_values['vram_usage'] = f"{vram_used/1024:5.1f}/{vram_total/1024:5.1f} {vram_usage_unit}".rjust(16,' ') + monitor_values['vram_usage'] = f"{mem_used/1024:5.1f}/{mem_total/1024:5.1f} {mem_usage_unit}".rjust(16,' ') except amdsmi_exception.AmdSmiLibraryException as e: if self.logger.is_json_format(): monitor_values['vram_used'] = "N/A" monitor_values['vram_total'] = "N/A" else: monitor_values['vram_usage'] = "N/A" - logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get %s memory usage on gpu %s | %s", mem_type_name.lower(), gpu_id, e.get_error_info()) - self.logger.table_header += 'VRAM_USAGE'.rjust(16) + # Use appropriate header based on memory type + header_name = f'{mem_type_name}_USAGE' + self.logger.table_header += header_name.rjust(16) if args.pcie: if pcie_info != "N/A": @@ -7518,11 +7525,20 @@ class AMDSMICommands(): power_usage = "N/A" 
def get_apu_memory_type_and_name(self, device_handle, gpu_id=None):
    """Pick the memory pool (VRAM or GTT) that should be reported for a GPU.

    Discrete GPUs always report VRAM. For a device whose ASIC info ``flags``
    has bit 0 set (AMDGPU_IDS_FLAGS_FUSION, i.e. an APU), the VRAM and GTT
    totals are compared in whole megabytes and GTT is chosen only when it is
    strictly larger. Any library failure along the way falls back to VRAM so
    callers always receive a usable pair.

    Args:
        device_handle: GPU device handle passed through to the amdsmi
            interface calls.
        gpu_id: Optional GPU ID, used only in debug log messages. When
            omitted it is looked up from ``device_handle``; if that lookup
            fails the literal ``"unknown"`` is logged instead.

    Returns:
        tuple: ``(memory_type, memory_type_name)`` where ``memory_type`` is an
        ``AmdSmiMemoryType`` member and ``memory_type_name`` is ``"VRAM"`` or
        ``"GTT"``.
    """
    # AMDGPU_IDS_FLAGS_FUSION: bit set in the ASIC flags when the GPU is an APU.
    apu_fusion_flag = 0x1
    bytes_per_mb = 1024 * 1024

    vram_type = amdsmi_interface.AmdSmiMemoryType.VRAM
    gtt_type = amdsmi_interface.AmdSmiMemoryType.GTT
    default_choice = (vram_type, "VRAM")

    if gpu_id is None:
        try:
            gpu_id = self.get_gpu_id_from_device_handle(device_handle)
        except Exception:
            # The ID is only used for logging, so a failed lookup must not
            # abort memory-type selection. Catch Exception rather than using
            # a bare except so KeyboardInterrupt/SystemExit still propagate.
            gpu_id = "unknown"

    try:
        asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(device_handle)
    except amdsmi_exception.AmdSmiLibraryException as e:
        logging.debug("Failed to get ASIC info for gpu %s, defaulting to VRAM | %s", gpu_id, e.get_error_info())
        return default_choice

    # A missing or non-integer flags field is treated as "not an APU".
    flags = asic_info['flags'] if 'flags' in asic_info else 0
    if not isinstance(flags, int) or not (flags & apu_fusion_flag):
        return default_choice

    try:
        vram_total_mb = amdsmi_interface.amdsmi_get_gpu_memory_total(device_handle, vram_type) // bytes_per_mb
        gtt_total_mb = amdsmi_interface.amdsmi_get_gpu_memory_total(device_handle, gtt_type) // bytes_per_mb
    except amdsmi_exception.AmdSmiLibraryException as e:
        logging.debug("Failed to compare memory types for APU gpu %s, defaulting to VRAM | %s", gpu_id, e.get_error_info())
        return default_choice

    # GTT wins only when strictly larger; ties keep the VRAM default.
    if gtt_total_mb > vram_total_mb:
        mem_type, mem_type_name = gtt_type, "GTT"
    else:
        mem_type, mem_type_name = default_choice

    logging.debug("APU detected for gpu %s, using %s (VRAM: %d MB, GTT: %d MB)", gpu_id, mem_type_name, vram_total_mb, gtt_total_mb)
    return mem_type, mem_type_name
resolve to a name return str(gid) @lru_cache(maxsize=128) def _cached_user_name(self, uid: int) -> str: - try: + try: return pwd.getpwuid(uid).pw_name - except Exception: + except Exception: # In containers, the GID may not resolve to a name return str(uid) @@ -1286,11 +1331,11 @@ class AMDSMIHelpers(): """ Check if the current user can access kfd and dri Specifically, only care for EACCES/EPERM - + Args: check_render (bool): Whether to check /dev/kfd & /dev/dri/renderD* devices. Defaults to True. check_video (bool): Whether to check /dev/dri/card* devices. Defaults to True. - + Returns: bool: True if all checked devices are accessible, False if any permission errors found """ @@ -1300,7 +1345,7 @@ class AMDSMIHelpers(): return True paths_to_check = [] - + # Only add paths for device types that are flagged for checking if check_render and os.path.exists("/dev/kfd"): paths_to_check.append("/dev/kfd") @@ -1319,7 +1364,7 @@ class AMDSMIHelpers(): # Do not try to open all paths, may cause driver issues. # Read access is sufficient to check permissions. # - # Reason: GPUs which support partitioning (memory/compute), + # Reason: GPUs which support partitioning (memory/compute), # logical devices will not be valid until configured. # See `sudo amd-smi set -h` or applicable APIs # to configure on supported hardware. 
@@ -1565,14 +1610,14 @@ class AMDSMIHelpers(): error_severity = entry.get("error_severity", "").lower() notify_type = entry.get("notify_type", "") prefix = self._severity_as_string(error_severity, notify_type, True) - + # Generate filenames count = self.get_cper_count() + 1 cper_name = f"{prefix}-{count}.cper" json_name = f"{prefix}-{count}.json" cper_path = folder / cper_name json_path = folder / json_name - + # Write CPER binary file try: self.write_binary( @@ -1582,7 +1627,7 @@ class AMDSMIHelpers(): ) except Exception as e: logging.debug(f"Failed to write CPER file {cper_path}: {e}") - + # Write JSON metadata file try: with json_path.open("w") as cper_json_file: @@ -1594,7 +1639,7 @@ class AMDSMIHelpers(): ) except Exception as e: logging.debug(f"Failed to write JSON file {json_path}: {e}") - + # Collect data for printing timestamp = entry.get("timestamp", "unknown") gpu_id = self.get_gpu_id_from_device_handle(device_handle) @@ -1980,13 +2025,13 @@ class AMDSMIHelpers(): """ Helper method to compute metric version, partition ID, and num_partition for dynamic metrics. Handles logging updates internally for reusability. - + Args: gpu_metrics_info (dict): GPU metrics info from amdsmi_get_gpu_metrics_info. is_partition_metrics (bool): Whether this is for partition metrics. gpu_id (int): GPU ID for logging. gpu_handle: GPU device handle for KFD info retrieval. 
- + Returns: dict: { 'metric_version': float or "N/A", @@ -2004,7 +2049,7 @@ class AMDSMIHelpers(): metric_version = float(f"{format_rev}.{content_rev}") except ValueError: metric_version = "N/A" # Fallback if conversion fails - + # Retrieve partition ID from KFD info partition_id = "N/A" try: @@ -2012,7 +2057,7 @@ class AMDSMIHelpers(): partition_id = kfd_info.get('current_partition_id', "N/A") except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get current partition ID for GPU %s | %s", gpu_id, e.get_error_info()) - + # Determine num_partition with fallback logic for dynamic metrics num_partition = gpu_metrics_info.get('num_partition', "N/A") if metric_version != "N/A" and num_partition == "N/A": @@ -2026,22 +2071,22 @@ class AMDSMIHelpers(): # Fallback to partition_id if partitions exist but num_partition is unavailable num_partition = partition_id # Else: Remains "N/A" if no conditions match - + # Alias num_xcp for XCP metrics usage num_xcp = num_partition - + # Debug logging logging.debug( "GPU %s | Metric version: %s, num_partition: %s, partition_id: %s, num_xcp: %s", gpu_id, metric_version, num_partition, partition_id, num_xcp ) - + return { 'metric_version': metric_version, 'partition_id': partition_id, 'num_partition': num_partition, 'num_xcp': num_xcp - } + } def get_gpu_board_temperatures(self, device_handle, gpu_id, logger): """Get GPU board temperature readings diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index b28214a7f0..3448bef6af 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -1105,7 +1105,16 @@ class AMDSMILogger(): mem_usage = gpu_info['mem_usage'] if mem_usage != "N/A": - mem_usage = f"{gpu_info['mem_usage']['used_vram']}/{gpu_info['mem_usage']['total_vram']} MB" + # Support both VRAM and GTT memory types for APU-aware display + if 'used_gtt' in mem_usage and 'total_gtt' in mem_usage: + # GTT memory 
selected (likely APU) + mem_usage = f"{gpu_info['mem_usage']['used_gtt']}/{gpu_info['mem_usage']['total_gtt']} MB" + elif 'used_vram' in mem_usage and 'total_vram' in mem_usage: + # VRAM memory selected (standard or APU with more VRAM) + mem_usage = f"{gpu_info['mem_usage']['used_vram']}/{gpu_info['mem_usage']['total_vram']} MB" + else: + # Fallback if neither format is found + mem_usage = "N/A" mem_usage = mem_usage.rjust(21) print("| {0:12.12s} {1:22.22s} | {2:5.5s} {3:6.6s} {4:5.5s} {5:13.13s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage)) diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index eb9609be60..8061f12b68 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -927,7 +927,8 @@ typedef struct { uint32_t num_of_compute_units; //!< 0xFFFFFFFF if not supported uint64_t target_graphics_version; //!< 0xFFFFFFFFFFFFFFFF if not supported uint32_t subsystem_id; //!> The subsystem ID - uint32_t reserved[21]; + uint64_t flags; //!< Chip flags + uint32_t reserved[19]; } amdsmi_asic_info_t; /** diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index ee93a0688e..babb9990c5 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -2123,7 +2123,8 @@ def amdsmi_get_gpu_asic_info( "oam_id": _validate_if_max_uint(asic_info_struct.oam_id, MaxUIntegerTypes.UINT32_T), "num_compute_units": _validate_if_max_uint(asic_info_struct.num_of_compute_units, MaxUIntegerTypes.UINT32_T), "target_graphics_version": "gfx" + target_graphics_version, - "subsystem_id": subsystem_id + "subsystem_id": subsystem_id, + "flags": asic_info_struct.flags } string_values = ["market_name", "vendor_name"] diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index e0e88a063f..d06a02f4a0 100644 --- 
a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -1137,7 +1137,10 @@ struct_amdsmi_asic_info_t._fields_ = [ ('PADDING_0', ctypes.c_ubyte * 4), ('target_graphics_version', ctypes.c_uint64), ('subsystem_id', ctypes.c_uint32), - ('reserved', ctypes.c_uint32 * 21), + ('PADDING_1', ctypes.c_ubyte * 4), + ('flags', ctypes.c_uint64), + ('reserved', ctypes.c_uint32 * 19), + ('PADDING_2', ctypes.c_ubyte * 4), ] amdsmi_asic_info_t = struct_amdsmi_asic_info_t diff --git a/projects/amdsmi/rust-interface/src/amdsmi_wrapper.rs b/projects/amdsmi/rust-interface/src/amdsmi_wrapper.rs index 2a733d00e8..f8a69dc1ef 100644 --- a/projects/amdsmi/rust-interface/src/amdsmi_wrapper.rs +++ b/projects/amdsmi/rust-interface/src/amdsmi_wrapper.rs @@ -1156,7 +1156,8 @@ pub struct AmdsmiAsicInfoT { pub num_of_compute_units: u32, pub target_graphics_version: u64, pub subsystem_id: u32, - pub reserved: [u32; 21usize], + pub flags: u64, + pub reserved: [u32; 19usize], } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 3f832108b7..0bab87e9a9 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1716,6 +1716,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i info->num_of_compute_units = std::numeric_limits::max(); info->target_graphics_version = std::numeric_limits::max(); info->subsystem_id = std::numeric_limits::max(); + info->flags = 0; std::ostringstream ss; amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; @@ -1921,6 +1922,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i } // TODO(cpoag): check if this is correct, might be able to go through KGD/KFD info->rev_id = static_cast(dev_info.pci_rev); + info->flags = static_cast(dev_info.ids_flags); libdrm.unload(); ss << __PRETTY_FUNCTION__