diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 5ee21b5bad..84f1cbf43e 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -548,6 +548,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: power_limit_error = True max_power_limit = "N/A" + min_power_limit = "N/A" socket_power_limit = "N/A" logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info()) @@ -1517,7 +1518,7 @@ class AMDSMICommands(): gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4) logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Unable to load GPU Metrics table version for GPU %s | %s", gpu_id, e.err_info) + logging.debug("#1 - Unable to load GPU Metrics table version for %s | %s", gpu_id, e.err_info) try: # Get GPU Metrics table @@ -1525,7 +1526,7 @@ class AMDSMICommands(): gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4) logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, str(gpu_metric_str)) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info) + logging.debug("#2 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info) logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}") logging.debug(f"Args: {current_platform_args}") @@ -1544,7 +1545,85 @@ class AMDSMICommands(): # Get GPU Metrics table gpu_metric = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info) + logging.debug("#3 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info) + gpu_metric = { + "temperature_edge": "N/A", + 
"temperature_hotspot": "N/A", + "temperature_mem": "N/A", + "temperature_vrgfx": "N/A", + "temperature_vrsoc": "N/A", + "temperature_vrmem": "N/A", + "average_gfx_activity": "N/A", + "average_umc_activity": "N/A", + "average_mm_activity": "N/A", + "average_socket_power": "N/A", + "energy_accumulator": "N/A", + "system_clock_counter": "N/A", + "average_gfxclk_frequency": "N/A", + "average_socclk_frequency": "N/A", + "average_uclk_frequency": "N/A", + "average_vclk0_frequency": "N/A", + "average_dclk0_frequency": "N/A", + "average_vclk1_frequency": "N/A", + "average_dclk1_frequency": "N/A", + "current_gfxclk": "N/A", + "current_socclk": "N/A", + "current_uclk": "N/A", + "current_vclk0": "N/A", + "current_dclk0": "N/A", + "current_vclk1": "N/A", + "current_dclk1": "N/A", + "throttle_status": "N/A", + "current_fan_speed": "N/A", + "pcie_link_width": "N/A", + "pcie_link_speed": "N/A", + "gfx_activity_acc": "N/A", + "mem_activity_acc": "N/A", + "temperature_hbm": "N/A", + "firmware_timestamp": "N/A", + "voltage_soc": "N/A", + "voltage_gfx": "N/A", + "voltage_mem": "N/A", + "indep_throttle_status": "N/A", + "current_socket_power": "N/A", + "vcn_activity": "N/A", + "gfxclk_lock_status": "N/A", + "xgmi_link_width": "N/A", + "xgmi_link_speed": "N/A", + "pcie_bandwidth_acc": "N/A", + "pcie_bandwidth_inst": "N/A", + "pcie_l0_to_recov_count_acc": "N/A", + "pcie_replay_count_acc": "N/A", + "pcie_replay_rover_count_acc": "N/A", + "xgmi_read_data_acc": "N/A", + "xgmi_write_data_acc": "N/A", + "current_gfxclks": "N/A", + "current_socclks": "N/A", + "current_vclk0s": "N/A", + "current_dclk0s": "N/A", + "jpeg_activity": "N/A", + "pcie_nak_sent_count_acc": "N/A", + "pcie_nak_rcvd_count_acc": "N/A", + "accumulation_counter": "N/A", + "prochot_residency_acc": "N/A", + "ppt_residency_acc": "N/A", + "socket_thm_residency_acc": "N/A", + "vr_thm_residency_acc": "N/A", + "hbm_thm_residency_acc": "N/A", + "num_partition": "N/A", + "xcp_stats.gfx_busy_inst": "N/A", + "xcp_stats.jpeg_busy": 
"N/A", + "xcp_stats.vcn_busy": "N/A", + "xcp_stats.gfx_busy_acc": "N/A", + "xcp_stats.gfx_below_host_limit_acc": "N/A", + "xcp_stats.gfx_below_host_limit_ppt_acc": "N/A", + "xcp_stats.gfx_below_host_limit_thm_acc": "N/A", + "xcp_stats.gfx_low_utilization_acc": "N/A", + "xcp_stats.gfx_below_host_limit_total_acc": "N/A", + "pcie_lc_perf_other_end_recovery": "N/A", + "vram_max_bandwidth": "N/A", + "xgmi_link_status": "N/A", + } # Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth if "pcie" in current_platform_args: @@ -1828,24 +1907,35 @@ class AMDSMICommands(): # Populate GFX clock values try: current_gfx_clocks = gpu_metric["current_gfxclks"] - for clock_index, current_gfx_clock in enumerate(current_gfx_clocks): - # If the current clock is N/A then nothing else applies - if current_gfx_clock == "N/A": - continue + if current_gfx_clocks == "N/A": + # If the current gfx clocks are not available, we cannot proceed further + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS): + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index]["clk"] = "N/A" + clocks[gfx_index]["min_clk"] = "N/A" + clocks[gfx_index]["max_clk"] = "N/A" + clocks[gfx_index]["clk_locked"] = "N/A" + clocks[gfx_index]["deep_sleep"] = "N/A" # assume deep sleep if no clocks are available - gfx_index = f"gfx_{clock_index}" - clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger, - current_gfx_clock, - clock_unit) - - # Populate clock locked status - if gpu_metric["gfxclk_lock_status"] != "N/A": - gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag - if gpu_metric["gfxclk_lock_status"] & gfx_clock_lock_flag: - clocks[gfx_index]["clk_locked"] = "ENABLED" - else: - clocks[gfx_index]["clk_locked"] = "DISABLED" - except Exception as e: + else: + for clock_index, current_gfx_clock in enumerate(current_gfx_clocks): + # If the current clock is N/A then nothing else applies + if current_gfx_clock == "N/A": + 
continue + + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger, + current_gfx_clock, + clock_unit) + + # Populate clock locked status + if gpu_metric["gfxclk_lock_status"] != "N/A": + gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag + if gpu_metric["gfxclk_lock_status"] & gfx_clock_lock_flag: + clocks[gfx_index]["clk_locked"] = "ENABLED" + else: + clocks[gfx_index]["clk_locked"] = "DISABLED" + except KeyError as e: logging.debug("Failed to get current_gfxclks for gpu %s | %s", gpu_id, e) # Populate MEM clock value @@ -1861,31 +1951,51 @@ class AMDSMICommands(): # Populate VCLK clock values try: current_vclk_clocks = gpu_metric["current_vclk0s"] - for clock_index, current_vclk_clock in enumerate(current_vclk_clocks): - # If the current clock is N/A then nothing else applies - if current_vclk_clock == "N/A": - continue + if current_vclk_clocks == "N/A": + # If the current vclk clocks are not available, we cannot proceed further + for clock_index in range(kMAX_NUM_VCLKS): + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index]["clk"] = "N/A" + clocks[vclk_index]["min_clk"] = "N/A" + clocks[vclk_index]["max_clk"] = "N/A" + clocks[vclk_index]["clk_locked"] = "N/A" + clocks[vclk_index]["deep_sleep"] = "N/A" + else: + for clock_index, current_vclk_clock in enumerate(current_vclk_clocks): + # If the current clock is N/A then nothing else applies + if current_vclk_clock == "N/A": + continue - vclk_index = f"vclk_{clock_index}" - clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger, - current_vclk_clock, - clock_unit) - except Exception as e: + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_vclk_clock, + clock_unit) + except KeyError as e: logging.debug("Failed to get current_vclk0s for gpu %s | %s", gpu_id, e) # Populate DCLK clock values try: current_dclk_clocks = gpu_metric["current_dclk0s"] - for clock_index, 
current_dclk_clock in enumerate(current_dclk_clocks): - # If the current clock is N/A then nothing else applies - if current_dclk_clock == "N/A": - continue + if current_dclk_clocks == "N/A": + # If the current dclk clocks are not available, we cannot proceed further + for clock_index in range(kMAX_NUM_DCLKS): + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index]["clk"] = "N/A" + clocks[dclk_index]["min_clk"] = "N/A" + clocks[dclk_index]["max_clk"] = "N/A" + clocks[dclk_index]["clk_locked"] = "N/A" + clocks[dclk_index]["deep_sleep"] = "N/A" + else: + for clock_index, current_dclk_clock in enumerate(current_dclk_clocks): + # If the current clock is N/A then nothing else applies + if current_dclk_clock == "N/A": + continue - dclk_index = f"dclk_{clock_index}" - clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger, - current_dclk_clock, - clock_unit) - except Exception as e: + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_dclk_clock, + clock_unit) + except KeyError as e: logging.debug("Failed to get current_dclk0s for gpu %s | %s", gpu_id, e) # Populate FCLK clock value; fclk not present in gpu_metrics so use amdsmi_get_clk_freq @@ -1902,10 +2012,19 @@ class AMDSMICommands(): # Populate SOCCLK clock value try: current_socclk_clock = gpu_metric["current_socclk"] - clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger, - current_socclk_clock, - clock_unit) - except Exception as e: + if current_socclk_clock == "N/A": + # If the current socclk clocks are not available, we cannot proceed further + clocks["socclk_0"]["clk"] = "N/A" + clocks["socclk_0"]["min_clk"] = "N/A" + clocks["socclk_0"]["max_clk"] = "N/A" + clocks["socclk_0"]["clk_locked"] = "N/A" + clocks["socclk_0"]["deep_sleep"] = "N/A" + else: + # If the current clock is N/A then nothing else applies + clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger, + current_socclk_clock, + clock_unit) + except KeyError 
as e: logging.debug("Failed to get current_socclk for gpu %s | %s", gpu_id, e) # Populate the max and min clock values from sysfs @@ -1971,17 +2090,19 @@ class AMDSMICommands(): logging.debug("Failed to get vclk1 and/or dclk1 clock info for gpu %s | %s", gpu_id, e.get_error_info()) # if the current clock is N/A then we shouldn't populate the max and min values - if (vclk_clock_info_dict["min_clk"] != "N/A" or vclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 0: + if vclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 0: clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger, vclk_clock_info_dict["min_clk"], clock_unit) + if vclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 0: clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger, vclk_clock_info_dict["max_clk"], clock_unit) - if (dclk_clock_info_dict["min_clk"] != "N/A" or dclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 1: + if dclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 1: clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger, dclk_clock_info_dict["min_clk"], clock_unit) + if dclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 1: clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger, dclk_clock_info_dict["max_clk"], clock_unit) @@ -4234,7 +4355,10 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perf_determinism}") if args.compute_partition: + current_set_count = self.helpers.get_set_count() + future_set_count = 0 attempted_to_set = "N/A" + user_requested_partition_args = "N/A" try: (accelerator_set_choices, accelerator_profiles) = self.helpers.get_accelerator_choices_types_indices() logging.debug("args.compute_partition: %s; Accelerator_set_choices: %s", str(args.compute_partition), str(json.dumps(accelerator_set_choices, indent=4))) @@ -4242,20 +4366,30 @@ class 
AMDSMICommands(): compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] index = accelerator_profiles['profile_types'].index(args.compute_partition) attempted_to_set = f"Attempted to set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]}) on {gpu_string}" + user_requested_partition_args = f"{args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]})" amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition) - self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]})") elif args.compute_partition in accelerator_profiles['profile_indices']: compute_partition = int(args.compute_partition) index = accelerator_profiles['profile_indices'].index(args.compute_partition) attempted_to_set = f"Attempted to set accelerator partition to {accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition}) on {gpu_string}" + user_requested_partition_args = f"{accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition})" amdsmi_interface.amdsmi_set_gpu_accelerator_partition_profile(args.gpu, compute_partition) - self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition})") else: raise ValueError(f"Invalid accelerator configuration {args.compute_partition} on {gpu_string}") + self.helpers.increment_set_count() + future_set_count = self.helpers.get_set_count() + if current_set_count == future_set_count-1: + self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {user_requested_partition_args}") except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == 
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e + elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: + self.helpers.increment_set_count() + future_set_count = self.helpers.get_set_count() + if current_set_count == future_set_count-1: + out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting compute partition to {user_requested_partition_args}" + self.logger.store_output(args.gpu, 'accelerator_partition', out) elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_SETTING_UNAVAILABLE: print(f"\n{attempted_to_set}\n" f"\n[AMDSMI_STATUS_SETTING_UNAVAILABLE] Please check amd-smi partition --memory --accelerator for available profiles.\n" @@ -4327,7 +4461,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL: - out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}" + out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition}" print(f"Valid Memory partition Modes: {memory_dict['caps']}\n") self.logger.store_output(args.gpu, 'memory_partition', out) self.logger.print_output() @@ -4335,7 +4469,7 @@ class AMDSMICommands(): lock.release() return if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: - out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting memory partition to {args.memory_partition} on {gpu_string}" + out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting memory partition to {args.memory_partition}" self.logger.store_output(args.gpu, 'memory_partition', out) self.logger.print_output() self.logger.clear_multiple_devices_output() @@ -4348,7 +4482,7 @@ class AMDSMICommands(): thread.terminate() 
thread.join() if timesToRetryRestartErr < 0: - out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition} on {gpu_string}" + out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition}" self.logger.store_output(args.gpu, 'memory_partition', out) self.logger.print_output() self.logger.clear_multiple_devices_output() @@ -5064,7 +5198,7 @@ class AMDSMICommands(): gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4) logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Unable to load GPU Metrics table version for GPU %s | %s", gpu_id, e.err_info) + logging.debug("#4 - Unable to load GPU Metrics table version for %s | %s", gpu_id, e.err_info) try: # Get GPU Metrics table @@ -5072,7 +5206,7 @@ class AMDSMICommands(): gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4) logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, str(gpu_metric_str)) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info) + logging.debug("#5 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info) # Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls if args.pcie: @@ -5892,13 +6026,13 @@ class AMDSMICommands(): gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) try: partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu) + partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "") profile_type = partition_dict['partition_profile']['profile_type'] profile_index = partition_dict['partition_profile']['profile_index'] - partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "") except 
amdsmi_exception.AmdSmiLibraryException as e: profile_type = "N/A" profile_index = "N/A" partition_id = "0" logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info()) try: current_mem_cap = amdsmi_interface.amdsmi_get_gpu_memory_partition(gpu) @@ -5975,7 +6109,7 @@ class AMDSMICommands(): prev_gpu_id = "N/A" for gpu in args.gpu: gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) - tabular_output_dict = {"gpu_id": "N/A", + tabular_output_dict = {"gpu_id": gpu_id, "profile_index": "N/A", "memory_partition_caps": "N/A", "accelerator_type": "N/A", @@ -5990,6 +6124,7 @@ class AMDSMICommands(): partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu) partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "") current_accelerator_type = partition_dict['partition_profile']['profile_type'] + tabular_output_dict["partition_id"] = partition_id # save only the primary GPU node's partition_id (the 1st listed device; non N/A one) # else keep current_partition_id unchanged for displaying in accelerator resource's output diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 6f869304ec..0e53d8b1cd 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -741,12 +741,25 @@ class AMDSMIHelpers(): accelerator_partition_profiles['memory_caps'].append(profile['profiles'][p]['memory_caps']) break # Only need to get the profiles for one device except amdsmi_interface.AmdSmiLibraryException as e: + logging.debug(f"AMDSMIHelpers.get_accelerator_partition_profile_config - Unable to get accelerator partition profile config for device {dev}: {str(e)}") + if e.err_code == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: +
logging.debug(f"AMDSMIHelpers.get_accelerator_partition_profile_config - Device {dev} does not support accelerator partition profiles") + return accelerator_partition_profiles + break + except Exception as e: + logging.debug(f"AMDSMIHelpers.get_accelerator_partition_profile_config - Unexpected error occured --> Unable to get accelerator partition profile config for device {dev}: {str(e)}") break return accelerator_partition_profiles def get_accelerator_choices_types_indices(self): return_val = ("N/A", {'profile_indices':[], 'profile_types':[]}) + if os.geteuid() != 0: + logging.debug("AMDSMIHelpers.get_accelerator_choices_types_indices - Not root, unable to get accelerator partition profiles") + # If not root, we can't get the accelerator partition profiles + return return_val + else: + logging.debug("AMDSMIHelpers.get_accelerator_choices_types_indices - Root, getting accelerator partition profiles") accelerator_partition_profiles = self.get_accelerator_partition_profile_config() if len(accelerator_partition_profiles['profile_types']) != 0: compute_partitions_str = accelerator_partition_profiles['profile_types'] + accelerator_partition_profiles['profile_indices'] @@ -787,11 +800,15 @@ class AMDSMIHelpers(): power_cap_min = amdsmi_interface.MaxUIntegerTypes.UINT64_T # start out at max and min and then find real min and max power_cap_max = 0 for dev in device_handles: - power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(dev) - if power_cap_info['max_power_cap'] > power_cap_max: - power_cap_max = power_cap_info['max_power_cap'] - if power_cap_info['min_power_cap'] < power_cap_max: - power_cap_min = power_cap_info['min_power_cap'] + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(dev) + if power_cap_info['max_power_cap'] > power_cap_max: + power_cap_max = power_cap_info['max_power_cap'] + if power_cap_info['min_power_cap'] < power_cap_min: + power_cap_min = power_cap_info['min_power_cap'] + except amdsmi_interface.AmdSmiLibraryException as
e: + logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}") + continue return (power_cap_min, power_cap_max) diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h index ebe1756537..a919726cd1 100644 --- a/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_gpu_device.h @@ -60,6 +60,8 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { pthread_mutex_t* get_mutex(); uint32_t get_gpu_id() const; uint32_t get_gpu_fd() const; + uint32_t get_card_id(); // -e feature + we can get card_id for our internal functions + uint32_t get_drm_render_minor(); // -e feature + we can get card_id for our internal functions std::string& get_gpu_path(); amdsmi_bdf_t get_bdf(); bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); } @@ -80,9 +82,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_status_t amdgpu_query_driver_date(std::string& date) const; // New methods for -e feature - std::string bdf_to_string() const; - uint32_t get_card_from_bdf() const; - uint32_t get_render_id() const; + std::string bdf_to_string() const; // -e feature private: uint32_t gpu_id_; @@ -91,6 +91,8 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_bdf_t bdf_; uint32_t vendor_id_; AMDSmiDrm& drm_; + uint32_t card_index_; + uint32_t drm_render_minor_; GPUComputeProcessList_t compute_process_list_; int32_t get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, ComputeProcessListType_t list_type); diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 1940acb669..7d6d75fb85 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -2173,13 +2173,17 @@ def amdsmi_get_clock_info( ) ) - return { - "clk": clock_measure.clk, - "min_clk": 
clock_measure.min_clk, - "max_clk": clock_measure.max_clk, - "clk_locked": clock_measure.clk_locked, - "clk_deep_sleep" : clock_measure.clk_deep_sleep, + clk_type_str = AmdSmiClkType(clock_type).name + + dict_ret = { + "clk": _validate_if_max_uint(clock_measure.clk, MaxUIntegerTypes.UINT32_T), + "min_clk": _validate_if_max_uint(clock_measure.min_clk, MaxUIntegerTypes.UINT32_T), + "max_clk": _validate_if_max_uint(clock_measure.max_clk, MaxUIntegerTypes.UINT32_T), + "clk_locked": _validate_if_max_uint(clock_measure.clk_locked, MaxUIntegerTypes.UINT8_T, isBool=True), + "clk_deep_sleep" : _validate_if_max_uint(clock_measure.clk_deep_sleep, MaxUIntegerTypes.UINT8_T, isBool=True), } + logging.debug("amdsmi_interface.py | amdsmi_get_clock_info | clk_type = " + clk_type_str + " | return_dictionary = \n" + str(json.dumps(dict_ret, indent=4))) + return dict_ret def amdsmi_get_gpu_bad_page_info( @@ -3129,51 +3133,76 @@ def amdsmi_get_gpu_accelerator_partition_profile( raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) + exception_caught = False length = 8 partition_id = [0, 0, 0, 0, 0, 0, 0, 0] partition_id_list = (ctypes.c_uint32 * length)(*partition_id) profile = amdsmi_wrapper.amdsmi_accelerator_partition_profile_t() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle, - ctypes.byref(profile), partition_id_list) - ) - profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[profile.profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "") - profile_type_ret = profile_type_ret.replace("INVALID", "N/A") - - length = profile.num_partitions partition_ids = [] - - #partition_id[0] will contain the partition id of each device - #BM/Guest will include this logic. Host will only display primary partition ids. 
kPOSITION_OF_PARTITION_ID = 0 - partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID]) - mem_caps_list = [] - if profile.memory_caps.nps_flags.nps1_cap == 1: - mem_caps_list.append("NPS1") - if profile.memory_caps.nps_flags.nps2_cap == 1: - mem_caps_list.append("NPS2") - if profile.memory_caps.nps_flags.nps4_cap == 1: - mem_caps_list.append("NPS4") - if profile.memory_caps.nps_flags.nps8_cap == 1: - mem_caps_list.append("NPS8") + ret = amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle, + ctypes.byref(profile), partition_id_list) + if ret == amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: + #partition_id[0] will contain the partition id of each device + #BM/Guest will include this logic. Host will only display primary partition ids. + partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID]) - partition_profile_dict = { - "profile_type" : profile_type_ret, - "num_partitions" : profile.num_partitions, - "profile_index" : profile.profile_index, - "memory_caps": mem_caps_list, - "num_resources" : profile.num_resources, - "resources" : "N/A" - } - return_dictionary = { - "partition_id" : partition_ids, - "partition_profile" : partition_profile_dict - } - - logging.debug("amdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile | return_dictionary = \n" + str(json.dumps(return_dictionary, indent=4))) - return return_dictionary + try: + _check_res(ret) + except AmdSmiException as e: + logging.debug("amdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile | exception_caught >> " + str(e)) + partition_profile_dict = { + "profile_type" : "N/A", + "num_partitions" : "N/A", + "profile_index" : "N/A", + "memory_caps": "N/A", + "num_resources" : "N/A", + "resources" : "N/A" + } + return_dictionary = { + "partition_id" : partition_ids, + "partition_profile" : partition_profile_dict + } + if ret == amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: + exception_caught = True + else: + _check_res(ret) # re-raise the exception 
if error is anything other than AMDSMI_STATUS_NOT_SUPPORTED + # this ensures we can get partition ID even if the profile is not supported. + # No 'finally:' here on purpose: a 'return' inside 'finally' would swallow the exception re-raised above. + if exception_caught: + logging.debug("amdsmi_interface.py | exception_caught >> amdsmi_get_gpu_accelerator_partition_profile | return_dictionary = \n" + str(json.dumps(return_dictionary, indent=4))) + return return_dictionary + else: + profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[profile.profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "") + profile_type_ret = profile_type_ret.replace("INVALID", "N/A") + length = profile.num_partitions + #partition_id[0] will contain the partition id of each device + #BM/Guest will include this logic. Host will only display primary partition ids. + partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID]) + mem_caps_list = [] + if profile.memory_caps.nps_flags.nps1_cap == 1: + mem_caps_list.append("NPS1") + if profile.memory_caps.nps_flags.nps2_cap == 1: + mem_caps_list.append("NPS2") + if profile.memory_caps.nps_flags.nps4_cap == 1: + mem_caps_list.append("NPS4") + if profile.memory_caps.nps_flags.nps8_cap == 1: + mem_caps_list.append("NPS8") + partition_profile_dict = { + "profile_type" : profile_type_ret, + "num_partitions" : profile.num_partitions, + "profile_index" : profile.profile_index, + "memory_caps": mem_caps_list, + "num_resources" : profile.num_resources, + "resources" : "N/A" + } + return_dictionary = { + "partition_id" : partition_ids, + "partition_profile" : partition_profile_dict + } + logging.debug("amdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile | return_dictionary = \n" + str(json.dumps(return_dictionary, indent=4))) + return return_dictionary def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: amdsmi_wrapper.amdsmi_processor_handle) -> Dict: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): @@ -4131,11 +4160,14 @@ def amdsmi_get_clk_freq( ) ) +
clk_type_str = AmdSmiClkType(clk_type).name + dict_ret = { "num_supported": freq.num_supported, "current": freq.current, "frequency": list(freq.frequency)[: freq.num_supported], } + logging.debug("amdsmi_interface.py | amdsmi_get_clk_freq | clk_type = " + clk_type_str + " | return_dictionary = \n" + str(json.dumps(dict_ret, indent=4))) return dict_ret diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 1535a320e9..09c9e54cb0 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -1445,6 +1445,35 @@ typedef union id { }; } rsmi_func_id_value_t; +/** + * @struct rsmi_device_identifiers_t + * @brief Structure to hold various identifiers for a GPU device. + * + * @details This structure contains fields that uniquely identify a GPU device, + * including its card index, DRM render minor, PCI Bus/Device/Function ID (BDFID), + * KFD GPU ID, partition ID, and SMI device ID. + */ +typedef struct { + //! The card index of the device. + uint32_t card_index; + //! The DRM render minor number of the device. + uint32_t drm_render_minor; + + //! The PCI Bus/Device/Function identifier (BDFID) of the device. + uint64_t bdfid; + + //! The KFD (Kernel Fusion Driver) GPU ID of the device. + uint64_t kfd_gpu_id; + + //! The partition ID of the device. + uint32_t partition_id; + + //! The SMI (System Management Interface) device ID. + uint32_t smi_device_id; + + uint32_t reserved[10]; +} rsmi_device_identifiers_t; + /*****************************************************************************/ /** @defgroup InitShutAdmin Initialization and Shutdown @@ -2009,6 +2038,36 @@ rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid); */ rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id); +/** + * @brief Retrieves the device identifiers for a specific GPU device.
+ * + * @details This function retrieves various identifiers for a GPU device, such as + * the card index, DRM render minor, BDFID, KFD GPU ID, partition ID, and SMI device ID. + * The identifiers are written to the provided `rsmi_device_identifiers_t` structure. + * + * @param[in] dv_ind a device index. + * + * @param[out] identifiers A pointer to a structure of type `rsmi_device_identifiers_t` + * where the device identifiers will be stored. The structure + * contains fields such as: + * - `card_index`: The card index of the device. + * - `drm_render_minor`: The DRM render minor number. + * - `bdfid`: The Bus/Device/Function PCI identifier. + * - `kfd_gpu_id`: The KFD GPU ID. + * - `partition_id`: The partition ID of the device. + * - `smi_device_id`: The SMI device ID. + * + * @retval ::RSMI_STATUS_SUCCESS The call was successful, and the device identifiers were retrieved. + * @retval ::RSMI_STATUS_NOT_SUPPORTED The installed software or hardware does not support this function + * with the given arguments. + * @retval ::RSMI_STATUS_INVALID_ARGS The provided arguments are invalid. + * + * @note Ensure that the `identifiers` pointer is valid and points to a properly allocated structure + * before calling this function. 
+ */ +rsmi_status_t rsmi_dev_device_identifiers_get(uint32_t dv_ind, + rsmi_device_identifiers_t *identifiers); + /** @} */ // end of IDQuer diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h index 70049644d3..190533d3ab 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -200,14 +200,6 @@ class Device { public: explicit Device(std::string path, RocmSMI_env_vars const *e); ~Device(void); - typedef struct { - uint32_t card_index; - uint32_t drm_render_minor; - uint64_t bdfid; - uint64_t kfd_gpu_id; - uint32_t partition_id; - uint32_t smi_device_id; - } rsmi_device_identifiers_t; void set_monitor(std::shared_ptr m) {monitor_ = m;} std::string path(void) const {return path_;} diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_main.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_main.h index 777b2bb773..68e55a27cd 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_main.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_main.h @@ -33,6 +33,7 @@ #include #include // NOLINT #include +#include #include "rocm_smi/rocm_smi_io_link.h" #include "rocm_smi/rocm_smi_kfd.h" @@ -109,6 +110,15 @@ class RocmSMI { io_link_map_; std::map dev_ind_to_node_ind_map_; void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0); + typedef struct { + uint32_t card_index = std::numeric_limits::max(); + std::string dev_name = ""; + std::string drm_render_path = ""; + std::string drm_card_path = ""; + uint32_t drm_render_minor = std::numeric_limits::max(); + uint64_t bdfid = std::numeric_limits::max(); + } rsmi_device_enumeration_t; + rsmi_status_t AddToDeviceList2(rsmi_device_enumeration_t device); void GetEnvVariables(void); std::shared_ptr FindMonitor(std::string monitor_path); diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index 
dd80577b14..7626b3b64e 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -6569,8 +6569,10 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { std::string strCompPartition = "UNKNOWN"; const uint32_t PARTITION_LEN = 10; char compute_partition[PARTITION_LEN]; + compute_partition[0] = '\0'; rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN); if (ret == RSMI_STATUS_SUCCESS) { + strCompPartition.clear(); strCompPartition = compute_partition; } uint64_t pci_id = UINT64_MAX; @@ -6583,11 +6585,12 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { bdf_sstream << std::hex << std::setfill('0') << std::setw(4) << ((pci_id >> 32) & 0xFFFFFFFF) << ":"; bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 8) & 0xFF) << ":"; - bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 3) & 0xF8) << "."; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 3) & 0x1F) << "."; bdf_sstream << std::hex << std::setfill('0') << +(pci_id & 0x7); - bdf_sstream << "\nPartition ID ((pci_id >> 28) & 0xf): " << std::dec + bdf_sstream << "\n[Option 1] Partition ID ((pci_id >> 28) & 0xf): " << std::dec << static_cast((pci_id >> 28) & 0xf); - bdf_sstream << "\nPartition ID (pci_id & 0x7): " << std::dec << static_cast(pci_id & 0x7); + bdf_sstream << "\n[Option 2] Partition ID (pci_id & 0x7): " << std::dec + << static_cast(pci_id & 0x7); // std::cout << __PRETTY_FUNCTION__ << " BDF: " << bdf_sstream.str() << std::endl; /** @@ -6605,15 +6608,18 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { * bits [7:3] = Device * bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes */ + + // If the partition_id is still not set (bits [31:28]), we will use the fallback + // in function bits. We will use bits [2:0] as the partition ID. 
if (*partition_id != UINT32_MAX && *partition_id == 0 && - (strCompPartition == "DPX" || strCompPartition == "TPX" - || strCompPartition == "CPX" || strCompPartition == "QPX")) { + static_cast(pci_id & 0x7) != 0) { *partition_id = static_cast(pci_id & 0x7); } ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Success" << " | Device #: " << dv_ind + << " | Compute Partition: " << strCompPartition << " | Type: partition_id" << " | Data: " << static_cast(*partition_id) << " | Returning = " @@ -7487,6 +7493,21 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind) CATCH } +rsmi_status_t rsmi_dev_device_identifiers_get(uint32_t dv_ind, + rsmi_device_identifiers_t *smi_device_identifiers) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); + GET_DEV_FROM_INDX + if (smi_device_identifiers == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; + return ret = dev->get_smi_device_identifiers(dv_ind, smi_device_identifiers); + CATCH +} + // UNDOCUMENTED FUNCTIONS // This functions are not declared in rocm_smi.h. 
They are either not fully diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index ab81a76659..1b490a7135 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -1809,6 +1809,7 @@ std::string Device::readBootPartitionState( rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id, rsmi_device_identifiers_t *device_identifiers) { bool found_device = false; + std::ostringstream ss; rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; if (device_identifiers == nullptr) { return RSMI_STATUS_INVALID_ARGS; @@ -1816,20 +1817,38 @@ rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id, amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); auto devices = smi.devices(); + ss << __PRETTY_FUNCTION__ << " | device_id = " << device_id + << "; devices.size() = " << devices.size(); + // std::cout << ss.str() << "\n"; + LOG_DEBUG(ss); for (uint32_t i = 0; i < devices.size(); i++) { if (i != device_id) { continue; } - rsmi_device_identifiers_t smi_device; - smi_device.card_index = devices[i]->index(); - smi_device.drm_render_minor = devices[i]->drm_render_minor(); - smi_device.bdfid = devices[i]->bdfid(); - smi_device.kfd_gpu_id = devices[i]->kfd_gpu_id(); - smi_device.partition_id = devices[i]->m_partition_id; - smi_device.smi_device_id = i; - *device_identifiers = smi_device; + + device_identifiers->card_index = devices[i]->index(); + device_identifiers->drm_render_minor = devices[i]->drm_render_minor(); + device_identifiers->bdfid = devices[i]->bdfid(); + device_identifiers->kfd_gpu_id = devices[i]->kfd_gpu_id(); + uint32_t temp_partition_id = 0; + rsmi_status_t ret = rsmi_dev_partition_id_get( + i, &temp_partition_id); + if (ret != RSMI_STATUS_SUCCESS) { + temp_partition_id = 0; + } + device_identifiers->partition_id = temp_partition_id; + device_identifiers->smi_device_id = i; found_device = true; + ss << __PRETTY_FUNCTION__ << " | 
Found device: " + << "card_index = " << device_identifiers->card_index + << "; drm_render_minor = " << device_identifiers->drm_render_minor + << "; bdfid = " << std::hex << "0x" << device_identifiers->bdfid + << "; kfd_gpu_id = " << std::dec << device_identifiers->kfd_gpu_id + << "; partition_id = " << device_identifiers->partition_id + << "; smi_device_id = " << device_identifiers->smi_device_id; + // std::cout << ss.str() << "\n"; + LOG_DEBUG(ss); break; } if (found_device) { diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc index 8a2de8e761..374aeee11f 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -4570,8 +4570,13 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { dev->set_smi_device_id(dv_ind); uint32_t partition_id = 0; - rsmi_dev_partition_id_get(dv_ind, &partition_id); - dev->set_smi_partition_id(partition_id); + auto ret = rsmi_dev_partition_id_get(dv_ind, &partition_id); + if (ret == RSMI_STATUS_SUCCESS) { + dev->set_smi_partition_id(partition_id); + } else { + dev->set_smi_partition_id(0); + } + dev->dev_log_gpu_metrics(ostrstream); const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics(); diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc index aa98de89a0..b9f0435ebc 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc @@ -53,8 +53,6 @@ static const char *kPathDRMRoot = "/sys/class/drm"; static const char *kPathHWMonRoot = "/sys/class/hwmon"; static const char *kPathPowerRoot = "/sys/kernel/debug/dri"; -static const char *kDeviceNamePrefix = "card"; - static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""}; namespace amd { @@ -107,6 +105,44 @@ static uint32_t GetDrmRenderMinor(const std::string s) { return 
static_cast(drm_minor); } +// Find the drm minor from from sysfs path "/sys/class/drm/renderDX/device/drm". +// From the directory cardN in that sysfs path, the card number can be +// computed for renderDX. +// On success, return drm_minor which is >= 128 otherwise return 0xFFFFFFFF +static uint32_t GetCard(const std::string s) { + std::ostringstream ss; + std::string drm_path = s; + int card_num = -1; + const std::string card_file_prefix = "card"; + const uint64_t prefix_size = card_file_prefix.size(); + drm_path += "/device/drm"; + + auto card_dir = opendir(drm_path.c_str()); + if (card_dir == nullptr) + return static_cast(-1); + + auto dentry = readdir(card_dir); + + while (dentry != nullptr) { + std::string card_file = dentry->d_name; + if (!card_file.compare(0, prefix_size, card_file_prefix)) { + card_num = stoi(card_file.substr(prefix_size)); + if (card_num) + break; + } + dentry = readdir(card_dir); + } + + if (closedir(card_dir)) { + return static_cast(-1); + } + + ss << __PRETTY_FUNCTION__ << " | Discovered card = " + << std::to_string(card_num) << " | For drm_path = " << drm_path << " | "; + LOG_DEBUG(ss); + return static_cast(card_num); +} + // Determine if provided string is a bdfid pci path directory of the form // XXXX:XX:XX.X, // domain:bus:device.function @@ -170,12 +206,13 @@ static bool bdfid_from_path(const std::string in_name, uint64_t *bdfid) { // 0 = successful bdfid found // 1 = not a good bdfid found -static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { +[[maybe_unused]] static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { + std::ostringstream ss; assert(bdfid != nullptr); const unsigned int MAX_BDF_LENGTH = 512; char tpath[MAX_BDF_LENGTH] = {'\0'}; ssize_t ret; - memset(tpath,0,MAX_BDF_LENGTH); + memset(tpath, 0, MAX_BDF_LENGTH); ret = readlink(path.c_str(), tpath, MAX_BDF_LENGTH); @@ -183,6 +220,12 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { assert(ret < MAX_BDF_LENGTH); if (ret <= 0 
|| ret >= MAX_BDF_LENGTH) { + ss << __PRETTY_FUNCTION__ << " | readlink failed for path = " + << path << " | ret = " << ret + << " | errno = " << errno + << " | error = " << strerror(errno); + // std::cout << ss.str() << std::endl; + LOG_ERROR(ss); return 1; } @@ -200,11 +243,19 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { tmp = tpath_str.substr(slash_i + 1, end_i - slash_i); if (bdfid_from_path(tmp, bdfid)) { + ss << __PRETTY_FUNCTION__ << " | Found bdfid = " + << print_int_as_hex(*bdfid, true, 8) << " | from path = " + << path << " | tmp = " << tmp << std::endl; + LOG_INFO(ss); return 0; } end_i = slash_i - 1; } - + ss << __PRETTY_FUNCTION__ << " | No valid bdfid found in path = " + << path << " | tpath = " << tpath + << " | errno = " << errno + << " | error = " << strerror(errno) << std::endl; + LOG_ERROR(ss); return 1; } @@ -253,41 +304,8 @@ RocmSMI::Initialize(uint64_t flags) { "DiscoverAmdgpuDevices() failed."); } - uint64_t bdfid; - for (auto & device : devices_) { - if (ConstructBDFID(device->path(), &bdfid) != 0) { - std::cerr << "Failed to construct BDFID." 
<< std::endl; - ret = 1; - } else if (device->bdfid() != UINT64_MAX && device->bdfid() != bdfid) { - // handles secondary partitions - compute partition feature nodes - ss << __PRETTY_FUNCTION__ - << " | [before] device->path() = " << device->path() - << "\n | bdfid = " << bdfid - << "\n | device->bdfid() = " << device->bdfid() - << " (" << print_int_as_hex(device->bdfid()) << ")" - << "\n | (xgmi node) setting to setting " - << "device->set_bdfid(device->bdfid())"; - LOG_TRACE(ss); - device->set_bdfid(device->bdfid()); - } else { - // legacy & pcie card updates - ss << __PRETTY_FUNCTION__ - << " | [before] device->path() = " << device->path() - << "\n | bdfid = " << bdfid - << "\n | device->bdfid() = " << device->bdfid() - << " (" << print_int_as_hex(device->bdfid()) << ")" - << "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)"; - LOG_TRACE(ss); - device->set_bdfid(bdfid); - } - ss << __PRETTY_FUNCTION__ - << " | [after] device->path() = " << device->path() - << "\n | bdfid = " << bdfid - << "\n | device->bdfid() = " << device->bdfid() - << " (" << print_int_as_hex(device->bdfid()) << ")" - << "\n | final update: device->bdfid() holds correct device bdf"; - LOG_TRACE(ss); - } + ss << __PRETTY_FUNCTION__ << " | about to sort by BDF..." << std::endl; + LOG_DEBUG(ss); std::shared_ptr dev; // Sort index based on the BDF, collect BDF id firstly. 
@@ -382,6 +400,7 @@ RocmSMI::Initialize(uint64_t flags) { // displayAppTmpFilesContent(); std::string amdGPUDeviceList = displayAllDevicePaths(devices_); ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList; + // std::cout << ss.str() << std::endl; LOG_DEBUG(ss); } @@ -623,9 +642,11 @@ RocmSMI::FindMonitor(std::string monitor_path) { } void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) { + static const int BYTE = 8; std::ostringstream ss; ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); + auto dev_path = std::string(kPathDRMRoot); dev_path += "/"; dev_path += dev_name; @@ -637,7 +658,8 @@ void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) { const std::string& d_name = dev_name; uint32_t card_indx = GetDeviceIndex(d_name); - dev->set_drm_render_minor(GetDrmRenderMinor(dev_path)); + uint32_t drmRenderMinor = GetDrmRenderMinor(dev_path); + dev->set_drm_render_minor(drmRenderMinor); dev->set_card_index(card_indx); GetSupportedEventGroups(card_indx, dev->supported_event_groups()); if (bdfid != 0) { @@ -646,16 +668,120 @@ void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) { devices_.push_back(dev); ss << __PRETTY_FUNCTION__ - << " | Adding to device list dev_name = " << dev_name - << " | path = " << dev_path - << " | bdfid = " << bdfid - << " | card index = " << std::to_string(card_indx) << " | "; + << " | Adding to device list dev_name = " << dev_name << "\n" + << " | path = " << dev_path << "\n" + << " | dName = " << d_name << "\n" + << " | bdfid = " << (bdfid == UINT64_MAX ? + "N/A" : print_int_as_hex(bdfid, true, 2*BYTE)) << "\n" + << " | card index = " << std::to_string(card_indx) << "\n" + << " | drmRenderMinor = " << std::to_string(drmRenderMinor) << "\n" + << " | supported_event_groups = " << dev->supported_event_groups() << "\n"; + // std::cout << ss.str() << std::endl; LOG_DEBUG(ss); } +// AddToDeviceList2 is used to add a device to the device list. 
+// [precondition] a. Iterate through KFD to find all accessible devices. +// [precondition] b. Provide BDFID of the device & the device path (card or render path) +// 1. Provide to function: +// [optional; Will populate] rsmi_device_enumeration_t->card_index +// [optional; Will populate +// if card or render path provided] rsmi_device_enumeration_t->dev_name +// [optional; Will populate] rsmi_device_enumeration_t->drm_render_path +// [optional; Will populate] rsmi_device_enumeration_t->drm_card_path +// [optional; Will populate] rsmi_device_enumeration_t->drm_render_minor +// [Required] rsmi_device_enumeration_t->bdfid +rsmi_status_t RocmSMI::AddToDeviceList2(RocmSMI::rsmi_device_enumeration_t device) { + static const int BYTE = 8; + std::ostringstream ss; + + ss << __PRETTY_FUNCTION__ << " | ======= start =======" + << "\n | card index = [" << std::to_string(device.card_index) << "]\n" + << " | dev_name = [" << device.dev_name << "]\n" + << " | drm_render_path = [" << device.drm_render_path << "]\n" + << " | drm_card_path = [" << device.drm_card_path << "]\n" + << " | drm_render_minor = [" << std::to_string(device.drm_render_minor) + << "]\n | bdfid (value) = [" << (device.bdfid == UINT64_MAX ? + "N/A" : print_int_as_hex(device.bdfid, true, 4*BYTE)) << "]\n" + << " | bdfid (str) = [" + << std::hex << std::setfill('0') << std::setw(4) + << ((device.bdfid >> 32) & static_cast(0xFFFFFFFF)) << ":" + << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 8) + & static_cast(0xFF)) << ":" + << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 3) + & static_cast(0x1F)) << "." 
+ << std::hex << std::setfill('0') << std::setw(1) << +(device.bdfid + & static_cast(0x7)) << "]\n"; + // std::cout << ss.str() << std::endl; + LOG_TRACE(ss); + auto dev_path = std::string(kPathDRMRoot); + + if (device.dev_name.empty()) { + ss << __PRETTY_FUNCTION__ << " | dev_name is empty"; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + + dev_path += "/"; + dev_path += ("renderD" + std::to_string(device.drm_render_minor)); + uint32_t card_num = GetCard(dev_path); + device.dev_name = "card" + std::to_string(card_num); + device.drm_render_path = dev_path; + device.drm_card_path = std::string(kPathDRMRoot) + "/card" + + std::to_string(card_num); + device.card_index = card_num; + } + + auto dev = std::make_shared(dev_path, &env_vars_); + + std::shared_ptr m = FindMonitor(dev_path + "/device/hwmon"); + dev->set_monitor(m); + + const std::string& d_name = device.dev_name; + uint32_t card_indx = GetDeviceIndex(d_name); + uint32_t drmRenderMinor = GetDrmRenderMinor(dev_path); + dev->set_drm_render_minor(drmRenderMinor); + dev->set_card_index(card_indx); + GetSupportedEventGroups(card_indx, dev->supported_event_groups()); + if (device.bdfid != 0) { + dev->set_bdfid(device.bdfid); + } + + devices_.push_back(dev); + ss << __PRETTY_FUNCTION__ + << " | Adding to device list dev_name = " << device.dev_name << "\n" + << " | path = " << dev_path << "\n" + << " | dName = " << d_name << "\n" + << " | bdfid = " << (device.bdfid == UINT64_MAX ? 
+ "N/A" : print_int_as_hex(device.bdfid, true, 8*BYTE)) << "\n" + << " | card index = " << std::to_string(card_indx) << "\n" + << " | drmRenderMinor = " << std::to_string(drmRenderMinor) << "\n" + << " | supported_event_groups = " << dev->supported_event_groups() << "\n"; + ss << " | ======= rsmi_device_enumeration_t details =======\n" + << " | card index = [" << std::to_string(device.card_index) << "]\n" + << " | dev_name = [" << device.dev_name << "]\n" + << " | drm_render_path = [" << device.drm_render_path << "]\n" + << " | drm_card_path = [" << device.drm_card_path << "]\n" + << " | drm_render_minor = [" << std::to_string(device.drm_render_minor) + << "]\n | bdfid (value) = [" << (device.bdfid == UINT64_MAX ? + "N/A" : print_int_as_hex(device.bdfid, true, 8*BYTE)) << "]\n" + << " | bdfid (str) = [" + << std::hex << std::setfill('0') << std::setw(4) + << ((device.bdfid >> 32) & static_cast(0xFFFFFFFF)) << ":" + << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 8) + & static_cast(0xFF)) << ":" + << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 3) + & static_cast(0x1F)) << "." 
+ << std::hex << std::setfill('0') << std::setw(1) << +(device.bdfid + & static_cast(0x7)) << "]\n" + << " | END"; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + return RSMI_STATUS_SUCCESS; +} + static const uint32_t kAmdGpuId = 0x1002; -static bool isAMDGPU(std::string dev_path) { +[[maybe_unused]] static bool isAMDGPU(std::string dev_path) { bool isAmdGpu = false; std::ostringstream ss; std::string vend_path = dev_path + "/device/vendor"; @@ -691,44 +817,73 @@ static bool isAMDGPU(std::string dev_path) { return isAmdGpu; } +uint32_t GetLargestNodeNumber(const std::string& path = "/sys/class/kfd/kfd/topology/nodes/") { + std::ostringstream ss; + uint32_t largest_node_number = 0; + + // Open the directory + DIR* dir = opendir(path.c_str()); + if (!dir) { + // Return UINT32_MAX on error + ss << __PRETTY_FUNCTION__ << " | Failed to open directory: " << path + << " | errno = " << errno + << " | error = " << strerror(errno); + // std::cout << ss.str() << std::endl; + LOG_ERROR(ss); + return UINT32_MAX; + } + + struct dirent* entry; + while ((entry = readdir(dir)) != nullptr) { + // Skip "." and ".." 
+ if (entry->d_name[0] == '.') { + continue; + } + + // Check if the directory name is a number + std::string dir_name(entry->d_name); + if (std::all_of(dir_name.begin(), dir_name.end(), ::isdigit)) { + uint32_t node_number = static_cast(std::stoul(dir_name)); + largest_node_number = std::max(largest_node_number, node_number); + } + } + + if (closedir(dir)) { + // Return UINT32_MAX on error + ss << __PRETTY_FUNCTION__ << " | Failed to close directory: " << path + << " | errno = " << errno + << " | error = " << strerror(errno); + // std::cout << ss.str() << std::endl; + LOG_ERROR(ss); + return UINT32_MAX; + } + + return largest_node_number; +} + uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { std::string err_msg; - uint32_t count = 0; - int32_t cardId = 0; - int32_t max_cardId = -1; std::ostringstream ss; // If this gets called more than once, clear previous findings. devices_.clear(); monitors_.clear(); - auto drm_dir = opendir(kPathDRMRoot); - if (drm_dir == nullptr) { - err_msg = "Failed to open drm root directory "; - err_msg += kPathDRMRoot; - err_msg += "."; - perror(err_msg.c_str()); + uint32_t max_nodes = GetLargestNodeNumber(); + ss << __PRETTY_FUNCTION__ << " | Discovered a potential of " + << std::to_string(max_nodes) << " kfd nodes"; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + if (max_nodes == UINT32_MAX) { + ss << __PRETTY_FUNCTION__ << " | Failed to get largest node number"; + // std::cout << ss.str() << std::endl; + LOG_ERROR(ss); return 1; } - - auto dentry = readdir(drm_dir); - - while (dentry != nullptr) { - if (memcmp(dentry->d_name, kDeviceNamePrefix, strlen(kDeviceNamePrefix)) - == 0) { - if ((strcmp(dentry->d_name, ".") == 0) || - (strcmp(dentry->d_name, "..") == 0)) - continue; - sscanf(&dentry->d_name[strlen(kDeviceNamePrefix)], "%d", &cardId); - if (cardId > max_cardId) - max_cardId = cardId; - count++; - } - dentry = readdir(drm_dir); - } - ss << __PRETTY_FUNCTION__ << " | Discovered a potential of " - << 
std::to_string(count) << " cards" << " | "; - LOG_DEBUG(ss); + // Iterate through all nodes + // and read all properties + // under /sys/class/kfd/kfd/topology/nodes/ + // and add to systemNodes vector struct systemNode { uint32_t s_node_id = 0; @@ -741,24 +896,27 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { uint8_t s_device = 0; uint8_t s_function = 0; uint8_t s_partition_id = 0; + uint32_t s_drm_render_minor = 0; uint64_t padding = 0; // padding added in case new changes in future }; - // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id, - // location_id, bdf, domain, bus, device, - // partition_id} std::multimap allSystemNodes; std::set gpuNodeIdsFound; + std::vector systemNodes; uint32_t node_id = 0; static const int BYTE = 8; - while (true) { - uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0; + while (node_id <= max_nodes) { + ss << __PRETTY_FUNCTION__ << " | node_id = " << std::to_string(node_id); + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0, render_d = 0; int ret_gpu_id = get_gpu_id(node_id, &gpu_id); int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); int ret_loc_id = read_node_properties(node_id, "location_id", &location_id); int ret_domain = read_node_properties(node_id, "domain", &domain); + int ret_renderd = read_node_properties(node_id, "drm_render_minor", &render_d); bool isANode = (ret_gpu_id == 0 && - (ret_domain == 0 && ret_loc_id == 0)); + (ret_domain == 0 && ret_loc_id == 0 && ret_renderd == 0)); ss << __PRETTY_FUNCTION__ << " | isAGpuNode: " << (isANode ? "TRUE" : "FALSE") << "; is_vm_guest(): " << (is_vm_guest() ? 
"TRUE" : "FALSE") @@ -766,11 +924,13 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; ret_domain: " << ret_domain << "; ret_loc_id: " << ret_loc_id << "; ret_unique_id: " << ret_unique_id + << "\nret_renderd: " << ret_renderd << "\n[node_id = " << print_unsigned_hex_and_int(node_id) << "\n" << "; gpu_id = " << print_unsigned_hex_and_int(gpu_id) << "\n" << "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n" << "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n" - << "; domain = " << print_unsigned_hex_and_int(domain) + << "; domain = " << print_unsigned_hex_and_int(domain) << "\n" + << "; drm_render_minor = " << print_unsigned_hex_and_int(render_d) << "]\n"; LOG_DEBUG(ss); if (isANode || (is_vm_guest() && ret_gpu_id == 0)) { @@ -783,14 +943,11 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { myNode.s_location_id = location_id; myNode.s_domain = domain & 0xFFFFFFFF; myNode.s_bdf = (myNode.s_domain << 32) | (myNode.s_location_id); - myNode.s_location_id = myNode.s_bdf; - myNode.s_bdf |= ((domain & 0xFFFFFFFF) << 32); - myNode.s_location_id = myNode.s_bdf; - myNode.s_domain = myNode.s_location_id >> 32; myNode.s_bus = ((myNode.s_location_id >> 8) & 0xFF); myNode.s_device = ((myNode.s_location_id >> 3) & 0x1F); myNode.s_function = myNode.s_location_id & 0x7; myNode.s_partition_id = ((myNode.s_location_id >> 28) & 0xF); + myNode.s_drm_render_minor = static_cast((ret_renderd == 0) ? 
render_d : 0); if (gpu_id != 0) { // only add gpu nodes, 0 = CPU auto ret = gpuNodeIdsFound.insert(node_id); if (ret.second != false) { @@ -807,292 +964,45 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n" << "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n" << "; domain = " << print_unsigned_hex_and_int(domain) << "\n" + << "; bus = " << print_unsigned_hex_and_int(myNode.s_bus) << "\n" + << "; device = " << print_unsigned_hex_and_int(myNode.s_device) << "\n" + << "; function = " << print_unsigned_hex_and_int(myNode.s_function) << "\n" + << "; partition_id = " << print_unsigned_hex_and_int(myNode.s_partition_id) << "\n" + << "; bdf = " << print_unsigned_hex_and_int(myNode.s_bdf) << "\n" + << "; drm_render_minor = " << print_unsigned_hex_and_int(myNode.s_drm_render_minor) << "]\n"; LOG_DEBUG(ss); } - allSystemNodes.emplace(unique_id, myNode); + systemNodes.push_back(myNode); } - } else { - break; } node_id++; } ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {"; - for (auto i : allSystemNodes) { - ss << "\n[node_id = " << std::to_string(i.second.s_node_id) - << "; gpu_id = " << std::to_string(i.second.s_gpu_id) - << "; unique_id = " << std::to_string(i.second.s_unique_id) - << "; location_id = " << std::to_string(i.second.s_location_id) - << "; bdf = " << print_int_as_hex(i.second.s_bdf) - << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE) - << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE) - << "; device = " << print_int_as_hex(i.second.s_device, true, BYTE) - << "; function = " << std::to_string(i.second.s_function) - << "; partition_id = " << std::to_string(i.second.s_partition_id) - << "], "; + + for (auto i : systemNodes) { + ss << "\n[node_id = " << std::to_string(i.s_node_id) << "\n" + << "; gpu_id = " << std::to_string(i.s_gpu_id) << "\n" + << "; unique_id = " << std::to_string(i.s_unique_id) << "\n" + << "; location_id = " 
<< std::to_string(i.s_location_id) << "\n" + << "; bdf = " << print_int_as_hex(i.s_bdf) << "\n" + << "; domain = " << print_int_as_hex(i.s_domain, true, 2*BYTE) << "\n" + << "; bus = " << print_int_as_hex(i.s_bus, true, BYTE) << "\n" + << "; device = " << print_int_as_hex(i.s_device, true, BYTE) << "\n" + << "; function = " << std::to_string(i.s_function) << "\n" + << "; partition_id = " << std::to_string(i.s_partition_id) << "\n" + << "; drm_render_minor = " << std::to_string(i.s_drm_render_minor) + << "], \n"; + rsmi_device_enumeration_t rsmi_device; + rsmi_device.dev_name = ""; + rsmi_device.bdfid = i.s_bdf; + rsmi_device.drm_render_minor = i.s_drm_render_minor; + AddToDeviceList2(rsmi_device); } ss << "}"; + // std::cout << ss.str() << std::endl; LOG_DEBUG(ss); - - uint32_t cardAdded = 0; - // Discover all root cards & gpu partitions associated with each - for (int32_t cardId = 0; cardId <= max_cardId; cardId++) { - std::string path = kPathDRMRoot; - path += "/card"; - path += std::to_string(cardId); - uint64_t primary_unique_id = 0; - uint64_t device_uuid = 0; - bool doesDeviceSupportPartitions = false; - // get current partition - int kSize = 256; - char computePartition[kSize]; - std::string strCompPartition = "UNKNOWN"; - uint32_t numMonDevices = 0; - rsmi_num_monitor_devices(&numMonDevices); - - // each identified gpu card node is a primary node for - // potential matching unique ids - if (isAMDGPU(path) || - (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { - std::string d_name = "card"; - d_name += std::to_string(cardId); - uint32_t numMonDevices = 0; - rsmi_num_monitor_devices(&numMonDevices); - if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize) - == RSMI_STATUS_SUCCESS) { - strCompPartition = computePartition; - doesDeviceSupportPartitions = true; - } - rsmi_status_t ret_unique_id = - rsmi_dev_unique_id_get(cardAdded, &device_uuid); - auto temp_numb_nodes = allSystemNodes.count(device_uuid); - auto primaryBdfId = - 
allSystemNodes.lower_bound(device_uuid)->second.s_location_id; - auto i = allSystemNodes.lower_bound(device_uuid); - if (doesDeviceSupportPartitions && temp_numb_nodes > 1 - && ret_unique_id == RSMI_STATUS_SUCCESS) { - // helps identify xgmi nodes (secondary nodes) easier - ss << __PRETTY_FUNCTION__ << " | secondary node add ; " - << " BDF = " << std::to_string(primaryBdfId) - << " (" << print_int_as_hex(primaryBdfId) << ")"; - LOG_DEBUG(ss); - if (doesDeviceSupportPartitions && strCompPartition != "SPX" - && i->second.s_partition_id == 0) { - i->second.s_partition_id = i->second.s_function; - ss << __PRETTY_FUNCTION__ << " | (secondary node add) fall back - " - << "detected !SPX && partition_id == 0" - << "; function = " << std::to_string(i->second.s_function) - << "; partition_id = " << std::to_string(i->second.s_partition_id); - LOG_DEBUG(ss); - } - ss << __PRETTY_FUNCTION__ - << " | (secondary node add) B4 AddToDeviceList() -->" - << "\n[node_id = " << std::to_string(i->second.s_node_id) - << "; gpu_id = " << std::to_string(i->second.s_gpu_id) - << "; unique_id = " << std::to_string(i->second.s_unique_id) - << "; location_id = " << std::to_string(i->second.s_location_id) - << "; bdf = " << print_int_as_hex(i->second.s_bdf) - << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE) - << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE) - << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE) - << "; function = " << std::to_string(i->second.s_function) - << "; partition_id = " << std::to_string(i->second.s_partition_id) - << "], "; - LOG_DEBUG(ss); - ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #1 (secondary node) \n" - << "; bdf: " << print_unsigned_hex_and_int(primaryBdfId) << "\n"; - LOG_DEBUG(ss); - } else { - ss << __PRETTY_FUNCTION__ << " | primary node add ; " - << " BDF = " << std::to_string(UINT64_MAX); - if (doesDeviceSupportPartitions && strCompPartition != "SPX" - && i->second.s_partition_id == 0) { - 
i->second.s_partition_id = i->second.s_function; - ss << __PRETTY_FUNCTION__ << " | (primary node add) fall back - " - << "detected !SPX && partition_id == 0" - << "; function = " << std::to_string(i->second.s_function) - << "; partition_id = " << std::to_string(i->second.s_partition_id); - LOG_DEBUG(ss); - } - LOG_DEBUG(ss); - ss << __PRETTY_FUNCTION__ - << " | (primary node add) After AddToDeviceList() -->" - << "\n[node_id = " << std::to_string(i->second.s_node_id) - << "; gpu_id = " << std::to_string(i->second.s_gpu_id) - << "; unique_id = " << std::to_string(i->second.s_unique_id) - << "; location_id = " << std::to_string(i->second.s_location_id) - << "; bdf = " << print_int_as_hex(i->second.s_bdf) - << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE) - << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE) - << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE) - << "; function = " << std::to_string(i->second.s_function) - << "; partition_id = " << std::to_string(i->second.s_partition_id) - << "], "; - LOG_DEBUG(ss); - ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #2 (primary node) \n" - << "; bdf: " << print_unsigned_hex_and_int(UINT64_MAX) << "\n"; - LOG_DEBUG(ss); - AddToDeviceList(d_name, UINT64_MAX); - } - - ss << __PRETTY_FUNCTION__ - << " | Ordered system nodes seen in lookup = {"; - for (auto i : allSystemNodes) { - ss << "\n[node_id = " << std::to_string(i.second.s_node_id) - << "; gpu_id = " << std::to_string(i.second.s_gpu_id) - << "; unique_id = " << std::to_string(i.second.s_unique_id) - << "; location_id = " << std::to_string(i.second.s_location_id) - << "; bdf = " << print_int_as_hex(i.second.s_bdf) - << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE) - << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE) - << "; device = " << print_int_as_hex(i.second.s_device, true, BYTE) - << "; function = " << std::to_string(i.second.s_function) - << "; partition_id = " << 
std::to_string(i.second.s_partition_id) - << "], "; - } - ss << "}"; - LOG_DEBUG(ss); - - uint64_t temp_primary_unique_id = 0; - if (allSystemNodes.empty()) { - cardAdded++; - ss << __PRETTY_FUNCTION__ - << " | allSystemNodes.empty() = true, continue..."; - LOG_DEBUG(ss); - continue; - } - - // get current partition - rsmi_num_monitor_devices(&numMonDevices); - if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize) - == RSMI_STATUS_SUCCESS) { - strCompPartition = computePartition; - } - if (rsmi_dev_unique_id_get(cardAdded, &device_uuid) - != RSMI_STATUS_SUCCESS) { - cardAdded++; - allSystemNodes.erase(device_uuid); - ss << __PRETTY_FUNCTION__ - << " | rsmi_dev_unique_id_get(cardId, &device_uuid)" - << " was not successful, continue.. "; - LOG_DEBUG(ss); - continue; - } - - temp_primary_unique_id = - allSystemNodes.find(device_uuid)->second.s_unique_id; - temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id); - - ss << __PRETTY_FUNCTION__ - << " | device/node id (cardId) = " << std::to_string(cardId) - << " | card id (cardAdded) = " << std::to_string(cardAdded) - << " | numMonDevices = " << std::to_string(numMonDevices) - << " | compute partition = " << strCompPartition - << " | temp_primary_unique_id = " - << std::to_string(temp_primary_unique_id) - << " | Num of nodes matching temp_primary_unique_id = " - << temp_numb_nodes - << " | device_uuid (hex/uint) = " - << print_unsigned_hex_and_int(device_uuid) - << " | device_uuid (uint64_t) = " << device_uuid; - LOG_DEBUG(ss); - - if (temp_primary_unique_id != 0) { - primary_unique_id = temp_primary_unique_id; - } else { - cardAdded++; - // remove already added nodes associated with current card - allSystemNodes.erase(0); - continue; - } - - auto numb_nodes = allSystemNodes.count(primary_unique_id); - ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = " - << std::to_string(primary_unique_id) << " has " - << std::to_string(numb_nodes) << " known gpu nodes"; - LOG_DEBUG(ss); - while 
(numb_nodes > 1) { - std::string secNode = "card"; - secNode += std::to_string(cardId); // maps the primary node card to - // secondary - allows get/sets - auto it = allSystemNodes.lower_bound(device_uuid); - auto it_end = allSystemNodes.upper_bound(device_uuid); - if (numb_nodes == temp_numb_nodes) { - auto removalNodeId = it->second.s_node_id; - auto removalGpuId = it->second.s_gpu_id; - auto removalUniqueId = it->second.s_unique_id; - auto removalLocId = it->second.s_location_id; - auto removaldomain = it->second.s_domain; - auto nodesErased = 1; - allSystemNodes.erase(it++); - ss << __PRETTY_FUNCTION__ - << "\nPRIMARY --> num_nodes == temp_numb_nodes; ERASING " - << std::to_string(nodesErased) << " node -> [node_id = " - << std::to_string(removalNodeId) - << "; gpu_id = " << std::to_string(removalGpuId) - << "; unique_id = " << std::to_string(removalUniqueId) - << "; location_id = " << std::to_string(removalLocId) - << "; removaldomain = " << std::to_string(removaldomain) - << "]"; - LOG_DEBUG(ss); - } - if (it == it_end) { - break; - } - auto myBdfId = it->second.s_location_id; - ss << __PRETTY_FUNCTION__ << " | secondary node add #2; " - << " BDF = " << std::to_string(myBdfId) - << " (" << print_int_as_hex(myBdfId) << ")"; - LOG_DEBUG(ss); - if (doesDeviceSupportPartitions && strCompPartition != "SPX" - && it->second.s_partition_id == 0) { - it->second.s_partition_id = it->second.s_function; - ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) fall back - " - << "detected !SPX && partition_id == 0" - << "; function = " << std::to_string(it->second.s_function) - << "; partition_id = " << std::to_string(it->second.s_partition_id); - LOG_DEBUG(ss); - } - ss << __PRETTY_FUNCTION__ - << " | (secondary node add #2) B4 AddToDeviceList() -->" - << "\n[node_id = " << std::to_string(it->second.s_node_id) - << "; gpu_id = " << std::to_string(it->second.s_gpu_id) - << "; unique_id = " << std::to_string(it->second.s_unique_id) - << "; location_id = " << 
std::to_string(it->second.s_location_id) - << "; bdf = " << print_int_as_hex(it->second.s_bdf) - << "; domain = " << print_int_as_hex(it->second.s_domain, true, 2*BYTE) - << "; bus = " << print_int_as_hex(it->second.s_bus, true, BYTE) - << "; device = " << print_int_as_hex(it->second.s_device, true, BYTE) - << "; function = " << std::to_string(it->second.s_function) - << "; partition_id = " << std::to_string(it->second.s_partition_id) - << "], "; - LOG_DEBUG(ss); - ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #3 (secondary node add #2) \n" - << "; bdf: " << print_unsigned_hex_and_int(myBdfId) << "\n"; - LOG_DEBUG(ss); - AddToDeviceList(secNode, myBdfId); - allSystemNodes.erase(it++); - numb_nodes--; - cardAdded++; - } - // remove any remaining nodes associated with current card - auto erasedNodes = allSystemNodes.erase(primary_unique_id); - ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = " - << std::to_string(primary_unique_id) << " erased " - << std::to_string(erasedNodes) << " nodes"; - LOG_DEBUG(ss); - cardAdded++; - } - } - - if (closedir(drm_dir)) { - err_msg = "Failed to close drm root directory "; - err_msg += kPathDRMRoot; - err_msg += "."; - perror(err_msg.c_str()); - return 1; - } return 0; } diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 4cef2ad6c5..ac50b11f31 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -619,10 +619,10 @@ amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle, } // Retrieve DRM Card ID - info->drm_card = gpu_device->get_card_from_bdf(); + info->drm_card = gpu_device->get_card_id(); // Retrieve DRM Render ID - info->drm_render = gpu_device->get_render_id(); + info->drm_render = gpu_device->get_drm_render_minor(); // Retrieve HIP ID (difference from the smallest node ID) and HSA ID std::map> nodes; @@ -2267,6 +2267,7 @@ 
amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc << "\n profile_config->profiles[i].num_resources: " << profile_config->profiles[i].num_resources << std::endl; + // std::cout << ss.str() << std::endl; LOG_DEBUG(ss); } @@ -2425,6 +2426,7 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc } ss << __PRETTY_FUNCTION__ << " | END returning " << smi_amdgpu_get_status_string(return_status, false); + // std::cout << ss.str() << std::endl; LOG_INFO(ss); return return_status; @@ -2791,6 +2793,9 @@ amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle, { AMDSMI_CHECK_INIT(); // nullptr api supported + if (header_value != nullptr) { + *header_value = amd_metrics_table_header_t{}; // Use a default initializer for the struct + } return rsmi_wrapper(rsmi_dev_metrics_header_info_get, processor_handle, 0, reinterpret_cast(header_value)); @@ -2802,7 +2807,7 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info( AMDSMI_CHECK_INIT(); // nullptr api supported if (pgpu_metrics != nullptr) { - *pgpu_metrics = {}; + *pgpu_metrics = amdsmi_gpu_metrics_t{}; // Use a default initializer for the struct } return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, 0, reinterpret_cast(pgpu_metrics)); @@ -3805,7 +3810,7 @@ amdsmi_get_gpu_cper_entries( return status; } std::string path = std::string("/sys/kernel/debug/dri/") + - std::to_string(gpu_device->get_card_from_bdf()) + + std::to_string(gpu_device->get_card_id()) + "/amdgpu_ring_cper"; @@ -3957,6 +3962,7 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, amdsmi_pcie_info_t *info) { AMDSMI_CHECK_INIT(); + std::ostringstream ss; if (info == nullptr) { return AMDSMI_STATUS_INVAL; @@ -3984,7 +3990,10 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a fscanf(fp, "%d", &pcie_width); fclose(fp); } else { - 
printf("Failed to open file: %s \n", path_max_link_width.c_str()); + ss << __PRETTY_FUNCTION__ + << " | Failed to open file: " << path_max_link_width + << " | returning AMDSMI_STATUS_API_FAILED"; + LOG_ERROR(ss); return AMDSMI_STATUS_API_FAILED; } info->pcie_static.max_pcie_width = (uint16_t)pcie_width; diff --git a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc index db7183ff63..f2e03a6eac 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc @@ -42,6 +42,64 @@ uint32_t AMDSmiGPUDevice::get_gpu_id() const { return gpu_id_; } +uint32_t AMDSmiGPUDevice::get_card_id() { + std::ostringstream ss; + // Should never return not_supported, but just in case + rsmi_status_t ret = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + uint32_t gpu_index = this->get_gpu_id(); + rsmi_device_identifiers_t identifiers = rsmi_device_identifiers_t{}; + ret = rsmi_dev_device_identifiers_get(gpu_index, &identifiers); + if (ret != rsmi_status_t::RSMI_STATUS_SUCCESS) { + this->card_index_ = std::numeric_limits::max(); + } else { + this->card_index_ = identifiers.card_index; + } + + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_identifiers_get status: " << getRSMIStatusString(ret, false) << "\n" + << " | gpu_id_: " << gpu_id_ << "\n" + << " | identifiers.card_index: " << identifiers.card_index << "\n" + << " | identifiers.drm_render_minor: " << identifiers.drm_render_minor << "\n" + << " | identifiers.bdfid: " << std::hex << "0x" << identifiers.bdfid << "\n" + << " | identifiers.kfd_gpu_id: " << std::dec << identifiers.kfd_gpu_id << "\n" + << " | identifiers.partition_id: " << identifiers.partition_id << "\n" + << " | identifiers.smi_device_id: " << identifiers.smi_device_id << "\n" + << " | returning card_index_: " + << this->card_index_ << std::endl; + // std::cout << ss.str(); + LOG_DEBUG(ss); + return this->card_index_; +} + +uint32_t AMDSmiGPUDevice::get_drm_render_minor() { + 
std::ostringstream ss; + // Should never return not_supported, but just in case + rsmi_status_t ret = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + uint32_t gpu_index = this->get_gpu_id(); + rsmi_device_identifiers_t identifiers = rsmi_device_identifiers_t{}; + ret = rsmi_dev_device_identifiers_get(gpu_index, &identifiers); + if (ret != rsmi_status_t::RSMI_STATUS_SUCCESS) { + this->drm_render_minor_ = std::numeric_limits::max(); + } else { + this->drm_render_minor_ = identifiers.drm_render_minor; + } + + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_identifiers_get status: " << getRSMIStatusString(ret, false) << "\n" + << " | gpu_id_: " << gpu_id_ << "\n" + << " | identifiers.card_index: " << identifiers.card_index << "\n" + << " | identifiers.drm_render_minor: " << identifiers.drm_render_minor << "\n" + << " | identifiers.bdfid: " << std::hex << "0x" << identifiers.bdfid << "\n" + << " | identifiers.kfd_gpu_id: " << std::dec << identifiers.kfd_gpu_id << "\n" + << " | identifiers.partition_id: " << identifiers.partition_id << "\n" + << " | identifiers.smi_device_id: " << identifiers.smi_device_id << "\n" + << " | returning drm_render_minor_: " + << this->drm_render_minor_ << std::endl; + // std::cout << ss.str(); + LOG_DEBUG(ss); + return this->drm_render_minor_; +} + uint32_t AMDSmiGPUDevice::get_gpu_fd() const { return fd_; } @@ -323,81 +381,6 @@ std::string AMDSmiGPUDevice::bdf_to_string() const { } -uint32_t AMDSmiGPUDevice::get_card_from_bdf() const { - const std::string drm_path = "/sys/class/drm/"; - - DIR* dir = opendir(drm_path.c_str()); - if (!dir) { - return std::numeric_limits::max(); - } - - struct dirent* entry; - while ((entry = readdir(dir)) != nullptr) { - std::string device_name = entry->d_name; - - // Check if the entry starts with "card" - if (device_name.find("card") == 0) { - const std::string card_path = drm_path + device_name + "/device"; - - // Open the uevent file for the device - std::ifstream uevent_file(card_path + "/uevent"); - if 
(!uevent_file) { - continue; // Skip if the file is not found - } - - std::string line; - while (std::getline(uevent_file, line)) { - // Check for the PCI_SLOT_NAME and if it contains the BDF - if (line.rfind("PCI_SLOT_NAME", 0) == 0 && line.find(bdf_to_string()) != std::string::npos) { - closedir(dir); - return std::stoi(device_name.substr(4)); // Convert extracted number to int - } - } - } - } - - closedir(dir); - return std::numeric_limits::max(); // Return -1 if no matching card is found -} - -uint32_t AMDSmiGPUDevice::get_render_id() const { - const std::string drm_path = "/sys/class/drm/"; - - DIR* dir = opendir(drm_path.c_str()); - if (!dir) { - return std::numeric_limits::max(); - } - - struct dirent* entry; - while ((entry = readdir(dir)) != nullptr) { - std::string device_name = entry->d_name; - - // Check if the entry starts with "renderD" - if (device_name.find("renderD") == 0) { - const std::string render_path = drm_path + device_name + "/device"; - - // Open the uevent file for the device - std::ifstream uevent_file(render_path + "/uevent"); - if (!uevent_file) { - continue; // Skip if the file is not found - } - - std::string line; - while (std::getline(uevent_file, line)) { - // Check for the PCI_SLOT_NAME and if it contains the BDF - if (line.rfind("PCI_SLOT_NAME", 0) == 0 && line.find(bdf_to_string()) != std::string::npos) { - closedir(dir); - return std::stoi(device_name.substr(7)); // Extract only the number after "renderD" - } - } - } - } - - closedir(dir); - return std::numeric_limits::max(); // Return -1 if no matching render ID is found -} - - } // namespace smi } // namespace amd diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index c1f42ebf8d..b58e306d77 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -115,7 +115,7 @@ int openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff, bool errorDiscovered = 
false; std::ifstream file(path, std::ifstream::in); std::string contents = {std::istreambuf_iterator{file}, std::istreambuf_iterator{}}; - clearCharBufferAndReinitialize(buff, sizeOfBuff, contents); + clearCharBufferAndReinitialize(buff, static_cast(sizeOfBuff), contents); if (!file.is_open()) { errorDiscovered = true; } else { @@ -453,21 +453,12 @@ amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, return AMDSMI_STATUS_SUCCESS; } -static uint32_t GetDeviceIndex(const std::string s) { - std::string t = s; - size_t tmp = t.find_last_not_of("0123456789"); - t.erase(0, tmp+1); - - assert(stoi(t) >= 0); - return static_cast(stoi(t)); -} - amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device, uint32_t *threshold) { SMIGPUDEVICE_MUTEX(device->get_mutex()) //TODO: Accessing the node requires root privileges, and its interface may need to be exposed in another path - uint32_t index = GetDeviceIndex(device->get_gpu_path()); + uint32_t index = device->get_card_id(); std::string fullpath = "/sys/kernel/debug/dri/" + std::to_string(index) + std::string("/ras/bad_page_cnt_threshold"); std::ifstream fs(fullpath.c_str()); @@ -489,7 +480,6 @@ amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* dev amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) { SMIGPUDEVICE_MUTEX(device->get_mutex()) - //uint32_t index = GetDeviceIndex(device->get_gpu_path()); //TODO: need to expose the corresponding interface to validate the checksum of ras eeprom table. 
//verify fail: return AMDSMI_STATUS_CORRUPTED_EEPROM return AMDSMI_STATUS_NOT_SUPPORTED; diff --git a/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc index 55dfee2286..8d148186c2 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/memorypartition_read_write.cc @@ -297,7 +297,7 @@ void TestMemoryPartitionReadWrite::Run(void) { << "\n\t\tcurrent_profile.num_resources: " << current_profile.num_resources << std::endl; } - for (auto j = 0; j < current_profile.num_resources; j++) { + for (uint32_t j = 0; j < current_profile.num_resources; j++) { auto rp = profile_config.resource_profiles[resource_index]; IF_VERB(STANDARD) {