[SWDEV-518325/SWDEV-518320/SWDEV-443309] Fix Partition Enumeration
* Changes:
- Updates to DRM renderD* / card* pathing for partition
- Now use KFD to discover AMD devices and populate accordingly
Device MUST have an accessible KFD node (via cgroups)
- Updated several AMD SMI CLI outputs to handle SYSFS files
which are not accessible on partition nodes
- Tests are updated to handle not supported features
- Added new method to help get card/drm info
(rsmi_dev_device_identifiers_get) from ROCm SMI
- Renamed device->get_card_id() & device->get_drm_render_minor();
these can now be used in internal AMD SMI calls.
- Removed warnings shown in build
Change-Id: Ice882fd9b97fb625a5bd4ef327f3ceaf247dc570
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/amdsmi commit: 4782528770]
Этот коммит содержится в:
коммит произвёл
Arif, Maisam
родитель
484614fe9b
Коммит
8d4a4d7b14
@@ -548,6 +548,7 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
power_limit_error = True
|
||||
max_power_limit = "N/A"
|
||||
min_power_limit = "N/A"
|
||||
socket_power_limit = "N/A"
|
||||
logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
@@ -1517,7 +1518,7 @@ class AMDSMICommands():
|
||||
gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4)
|
||||
logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unable to load GPU Metrics table version for GPU %s | %s", gpu_id, e.err_info)
|
||||
logging.debug("#1 - Unable to load GPU Metrics table version for %s | %s", gpu_id, e.err_info)
|
||||
|
||||
try:
|
||||
# Get GPU Metrics table
|
||||
@@ -1525,7 +1526,7 @@ class AMDSMICommands():
|
||||
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
|
||||
logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, str(gpu_metric_str))
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
logging.debug("#2 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
|
||||
logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}")
|
||||
logging.debug(f"Args: {current_platform_args}")
|
||||
@@ -1544,7 +1545,85 @@ class AMDSMICommands():
|
||||
# Get GPU Metrics table
|
||||
gpu_metric = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
logging.debug("#3 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
gpu_metric = {
|
||||
"temperature_edge": "N/A",
|
||||
"temperature_hotspot": "N/A",
|
||||
"temperature_mem": "N/A",
|
||||
"temperature_vrgfx": "N/A",
|
||||
"temperature_vrsoc": "N/A",
|
||||
"temperature_vrmem": "N/A",
|
||||
"average_gfx_activity": "N/A",
|
||||
"average_umc_activity": "N/A",
|
||||
"average_mm_activity": "N/A",
|
||||
"average_socket_power": "N/A",
|
||||
"energy_accumulator": "N/A",
|
||||
"system_clock_counter": "N/A",
|
||||
"average_gfxclk_frequency": "N/A",
|
||||
"average_socclk_frequency": "N/A",
|
||||
"average_uclk_frequency": "N/A",
|
||||
"average_vclk0_frequency": "N/A",
|
||||
"average_dclk0_frequency": "N/A",
|
||||
"average_vclk1_frequency": "N/A",
|
||||
"average_dclk1_frequency": "N/A",
|
||||
"current_gfxclk": "N/A",
|
||||
"current_socclk": "N/A",
|
||||
"current_uclk": "N/A",
|
||||
"current_vclk0": "N/A",
|
||||
"current_dclk0": "N/A",
|
||||
"current_vclk1": "N/A",
|
||||
"current_dclk1": "N/A",
|
||||
"throttle_status": "N/A",
|
||||
"current_fan_speed": "N/A",
|
||||
"pcie_link_width": "N/A",
|
||||
"pcie_link_speed": "N/A",
|
||||
"gfx_activity_acc": "N/A",
|
||||
"mem_activity_acc": "N/A",
|
||||
"temperature_hbm": "N/A",
|
||||
"firmware_timestamp": "N/A",
|
||||
"voltage_soc": "N/A",
|
||||
"voltage_gfx": "N/A",
|
||||
"voltage_mem": "N/A",
|
||||
"indep_throttle_status": "N/A",
|
||||
"current_socket_power": "N/A",
|
||||
"vcn_activity": "N/A",
|
||||
"gfxclk_lock_status": "N/A",
|
||||
"xgmi_link_width": "N/A",
|
||||
"xgmi_link_speed": "N/A",
|
||||
"pcie_bandwidth_acc": "N/A",
|
||||
"pcie_bandwidth_inst": "N/A",
|
||||
"pcie_l0_to_recov_count_acc": "N/A",
|
||||
"pcie_replay_count_acc": "N/A",
|
||||
"pcie_replay_rover_count_acc": "N/A",
|
||||
"xgmi_read_data_acc": "N/A",
|
||||
"xgmi_write_data_acc": "N/A",
|
||||
"current_gfxclks": "N/A",
|
||||
"current_socclks": "N/A",
|
||||
"current_vclk0s": "N/A",
|
||||
"current_dclk0s": "N/A",
|
||||
"jpeg_activity": "N/A",
|
||||
"pcie_nak_sent_count_acc": "N/A",
|
||||
"pcie_nak_rcvd_count_acc": "N/A",
|
||||
"accumulation_counter": "N/A",
|
||||
"prochot_residency_acc": "N/A",
|
||||
"ppt_residency_acc": "N/A",
|
||||
"socket_thm_residency_acc": "N/A",
|
||||
"vr_thm_residency_acc": "N/A",
|
||||
"hbm_thm_residency_acc": "N/A",
|
||||
"num_partition": "N/A",
|
||||
"xcp_stats.gfx_busy_inst": "N/A",
|
||||
"xcp_stats.jpeg_busy": "N/A",
|
||||
"xcp_stats.vcn_busy": "N/A",
|
||||
"xcp_stats.gfx_busy_acc": "N/A",
|
||||
"xcp_stats.gfx_below_host_limit_acc": "N/A",
|
||||
"xcp_stats.gfx_below_host_limit_ppt_acc": "N/A",
|
||||
"xcp_stats.gfx_below_host_limit_thm_acc": "N/A",
|
||||
"xcp_stats.gfx_low_utilization_acc": "N/A",
|
||||
"xcp_stats.gfx_below_host_limit_total_acc": "N/A",
|
||||
"pcie_lc_perf_other_end_recovery": "N/A",
|
||||
"vram_max_bandwidth": "N/A",
|
||||
"xgmi_link_status": "N/A",
|
||||
}
|
||||
|
||||
# Populate the pcie_dict first due to multiple gpu metrics calls incorrectly increasing bandwidth
|
||||
if "pcie" in current_platform_args:
|
||||
@@ -1828,24 +1907,35 @@ class AMDSMICommands():
|
||||
# Populate GFX clock values
|
||||
try:
|
||||
current_gfx_clocks = gpu_metric["current_gfxclks"]
|
||||
for clock_index, current_gfx_clock in enumerate(current_gfx_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_gfx_clock == "N/A":
|
||||
continue
|
||||
if current_gfx_clocks == "N/A":
|
||||
# If the current gfx clocks are not available, we cannot proceed further
|
||||
for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS):
|
||||
gfx_index = f"gfx_{clock_index}"
|
||||
clocks[gfx_index]["clk"] = "N/A"
|
||||
clocks[gfx_index]["min_clk"] = "N/A"
|
||||
clocks[gfx_index]["max_clk"] = "N/A"
|
||||
clocks[gfx_index]["clk_locked"] = "N/A"
|
||||
clocks[gfx_index]["deep_sleep"] = "N/A" # assume deep sleep if no clocks are available
|
||||
|
||||
gfx_index = f"gfx_{clock_index}"
|
||||
clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_gfx_clock,
|
||||
clock_unit)
|
||||
|
||||
# Populate clock locked status
|
||||
if gpu_metric["gfxclk_lock_status"] != "N/A":
|
||||
gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag
|
||||
if gpu_metric["gfxclk_lock_status"] & gfx_clock_lock_flag:
|
||||
clocks[gfx_index]["clk_locked"] = "ENABLED"
|
||||
else:
|
||||
clocks[gfx_index]["clk_locked"] = "DISABLED"
|
||||
except Exception as e:
|
||||
else:
|
||||
for clock_index, current_gfx_clock in enumerate(current_gfx_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_gfx_clock == "N/A":
|
||||
continue
|
||||
|
||||
gfx_index = f"gfx_{clock_index}"
|
||||
clocks[gfx_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_gfx_clock,
|
||||
clock_unit)
|
||||
|
||||
# Populate clock locked status
|
||||
if gpu_metric["gfxclk_lock_status"] != "N/A":
|
||||
gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag
|
||||
if gpu_metric["gfxclk_lock_status"] & gfx_clock_lock_flag:
|
||||
clocks[gfx_index]["clk_locked"] = "ENABLED"
|
||||
else:
|
||||
clocks[gfx_index]["clk_locked"] = "DISABLED"
|
||||
except KeyError as e:
|
||||
logging.debug("Failed to get current_gfxclks for gpu %s | %s", gpu_id, e)
|
||||
|
||||
# Populate MEM clock value
|
||||
@@ -1861,31 +1951,51 @@ class AMDSMICommands():
|
||||
# Populate VCLK clock values
|
||||
try:
|
||||
current_vclk_clocks = gpu_metric["current_vclk0s"]
|
||||
for clock_index, current_vclk_clock in enumerate(current_vclk_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_vclk_clock == "N/A":
|
||||
continue
|
||||
if current_vclk_clocks == "N/A":
|
||||
# If the current vclk clocks are not available, we cannot proceed further
|
||||
for clock_index in range(kMAX_NUM_VCLKS):
|
||||
vclk_index = f"vclk_{clock_index}"
|
||||
clocks[vclk_index]["clk"] = "N/A"
|
||||
clocks[vclk_index]["min_clk"] = "N/A"
|
||||
clocks[vclk_index]["max_clk"] = "N/A"
|
||||
clocks[vclk_index]["clk_locked"] = "N/A"
|
||||
clocks[vclk_index]["deep_sleep"] = "N/A"
|
||||
else:
|
||||
for clock_index, current_vclk_clock in enumerate(current_vclk_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_vclk_clock == "N/A":
|
||||
continue
|
||||
|
||||
vclk_index = f"vclk_{clock_index}"
|
||||
clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_vclk_clock,
|
||||
clock_unit)
|
||||
except Exception as e:
|
||||
vclk_index = f"vclk_{clock_index}"
|
||||
clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_vclk_clock,
|
||||
clock_unit)
|
||||
except KeyError as e:
|
||||
logging.debug("Failed to get current_vclk0s for gpu %s | %s", gpu_id, e)
|
||||
|
||||
# Populate DCLK clock values
|
||||
try:
|
||||
current_dclk_clocks = gpu_metric["current_dclk0s"]
|
||||
for clock_index, current_dclk_clock in enumerate(current_dclk_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_dclk_clock == "N/A":
|
||||
continue
|
||||
if current_dclk_clocks == "N/A":
|
||||
# If the current dclk clocks are not available, we cannot proceed further
|
||||
for clock_index in range(kMAX_NUM_DCLKS):
|
||||
dclk_index = f"dclk_{clock_index}"
|
||||
clocks[dclk_index]["clk"] = "N/A"
|
||||
clocks[dclk_index]["min_clk"] = "N/A"
|
||||
clocks[dclk_index]["max_clk"] = "N/A"
|
||||
clocks[dclk_index]["clk_locked"] = "N/A"
|
||||
clocks[dclk_index]["deep_sleep"] = "N/A"
|
||||
else:
|
||||
for clock_index, current_dclk_clock in enumerate(current_dclk_clocks):
|
||||
# If the current clock is N/A then nothing else applies
|
||||
if current_dclk_clock == "N/A":
|
||||
continue
|
||||
|
||||
dclk_index = f"dclk_{clock_index}"
|
||||
clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_dclk_clock,
|
||||
clock_unit)
|
||||
except Exception as e:
|
||||
dclk_index = f"dclk_{clock_index}"
|
||||
clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_dclk_clock,
|
||||
clock_unit)
|
||||
except KeyError as e:
|
||||
logging.debug("Failed to get current_dclk0s for gpu %s | %s", gpu_id, e)
|
||||
|
||||
# Populate FCLK clock value; fclk not present in gpu_metrics so use amdsmi_get_clk_freq
|
||||
@@ -1902,10 +2012,19 @@ class AMDSMICommands():
|
||||
# Populate SOCCLK clock value
|
||||
try:
|
||||
current_socclk_clock = gpu_metric["current_socclk"]
|
||||
clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_socclk_clock,
|
||||
clock_unit)
|
||||
except Exception as e:
|
||||
if current_socclk_clock == "N/A":
|
||||
# If the current socclk clocks are not available, we cannot proceed further
|
||||
clocks["socclk_0"]["clk"] = "N/A"
|
||||
clocks["socclk_0"]["min_clk"] = "N/A"
|
||||
clocks["socclk_0"]["max_clk"] = "N/A"
|
||||
clocks["socclk_0"]["clk_locked"] = "N/A"
|
||||
clocks["socclk_0"]["deep_sleep"] = "N/A"
|
||||
else:
|
||||
# If the current clock is N/A then nothing else applies
|
||||
clocks["socclk_0"]["clk"] = self.helpers.unit_format(self.logger,
|
||||
current_socclk_clock,
|
||||
clock_unit)
|
||||
except KeyError as e:
|
||||
logging.debug("Failed to get current_socclk for gpu %s | %s", gpu_id, e)
|
||||
|
||||
# Populate the max and min clock values from sysfs
|
||||
@@ -1971,17 +2090,19 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get vclk1 and/or dclk1 clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# if the current clock is N/A then we shouldn't populate the max and min values
|
||||
if (vclk_clock_info_dict["min_clk"] != "N/A" or vclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 0:
|
||||
if vclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 0:
|
||||
clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
vclk_clock_info_dict["min_clk"],
|
||||
clock_unit)
|
||||
if vclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 0:
|
||||
clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
vclk_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
if (dclk_clock_info_dict["min_clk"] != "N/A" or dclk_clock_info_dict["max_clk"] != "N/A") and clock_index == 1:
|
||||
if dclk_clock_info_dict["min_clk"] != "N/A" and clock_index == 1:
|
||||
clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger,
|
||||
dclk_clock_info_dict["min_clk"],
|
||||
clock_unit)
|
||||
if dclk_clock_info_dict["max_clk"] != "N/A" and clock_index == 1:
|
||||
clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger,
|
||||
dclk_clock_info_dict["max_clk"],
|
||||
clock_unit)
|
||||
@@ -4234,7 +4355,10 @@ class AMDSMICommands():
|
||||
|
||||
self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perf_determinism}")
|
||||
if args.compute_partition:
|
||||
current_set_count = self.helpers.get_set_count()
|
||||
future_set_count = 0
|
||||
attempted_to_set = "N/A"
|
||||
user_requested_partition_args = "N/A"
|
||||
try:
|
||||
(accelerator_set_choices, accelerator_profiles) = self.helpers.get_accelerator_choices_types_indices()
|
||||
logging.debug("args.compute_partition: %s; Accelerator_set_choices: %s", str(args.compute_partition), str(json.dumps(accelerator_set_choices, indent=4)))
|
||||
@@ -4242,20 +4366,30 @@ class AMDSMICommands():
|
||||
compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition]
|
||||
index = accelerator_profiles['profile_types'].index(args.compute_partition)
|
||||
attempted_to_set = f"Attempted to set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]}) on {gpu_string}"
|
||||
user_requested_partition_args = f"{args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]})"
|
||||
amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition)
|
||||
self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]})")
|
||||
elif args.compute_partition in accelerator_profiles['profile_indices']:
|
||||
compute_partition = int(args.compute_partition)
|
||||
index = accelerator_profiles['profile_indices'].index(args.compute_partition)
|
||||
attempted_to_set = f"Attempted to set accelerator partition to {accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition}) on {gpu_string}"
|
||||
user_requested_partition_args = f"{accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition})"
|
||||
amdsmi_interface.amdsmi_set_gpu_accelerator_partition_profile(args.gpu, compute_partition)
|
||||
self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition})")
|
||||
else:
|
||||
raise ValueError(f"Invalid accelerator configuration {args.compute_partition} on {gpu_string}")
|
||||
self.helpers.increment_set_count()
|
||||
future_set_count = self.helpers.get_set_count()
|
||||
if current_set_count == future_set_count-1:
|
||||
self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {user_requested_partition_args}")
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED:
|
||||
self.helpers.increment_set_count()
|
||||
future_set_count = self.helpers.get_set_count()
|
||||
if current_set_count == future_set_count-1:
|
||||
out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting compute partition to {user_requested_partition_args}"
|
||||
self.logger.store_output(args.gpu, 'accelerator_partition', out)
|
||||
elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_SETTING_UNAVAILABLE:
|
||||
print(f"\n{attempted_to_set}\n"
|
||||
f"\n[AMDSMI_STATUS_SETTING_UNAVAILABLE] Please check amd-smi partition --memory --accelerator for available profiles.\n"
|
||||
@@ -4327,7 +4461,7 @@ class AMDSMICommands():
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
|
||||
raise PermissionError('Command requires elevation') from e
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL:
|
||||
out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}"
|
||||
out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition}"
|
||||
print(f"Valid Memory partition Modes: {memory_dict['caps']}\n")
|
||||
self.logger.store_output(args.gpu, 'memory_partition', out)
|
||||
self.logger.print_output()
|
||||
@@ -4335,7 +4469,7 @@ class AMDSMICommands():
|
||||
lock.release()
|
||||
return
|
||||
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED:
|
||||
out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting memory partition to {args.memory_partition} on {gpu_string}"
|
||||
out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting memory partition to {args.memory_partition}"
|
||||
self.logger.store_output(args.gpu, 'memory_partition', out)
|
||||
self.logger.print_output()
|
||||
self.logger.clear_multiple_devices_output()
|
||||
@@ -4348,7 +4482,7 @@ class AMDSMICommands():
|
||||
thread.terminate()
|
||||
thread.join()
|
||||
if timesToRetryRestartErr < 0:
|
||||
out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition} on {gpu_string}"
|
||||
out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition}"
|
||||
self.logger.store_output(args.gpu, 'memory_partition', out)
|
||||
self.logger.print_output()
|
||||
self.logger.clear_multiple_devices_output()
|
||||
@@ -5064,7 +5198,7 @@ class AMDSMICommands():
|
||||
gpu_metric_version_str = json.dumps(gpu_metric_version_info, indent=4)
|
||||
logging.debug("GPU Metrics table Version for GPU %s | %s", gpu_id, gpu_metric_version_str)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unable to load GPU Metrics table version for GPU %s | %s", gpu_id, e.err_info)
|
||||
logging.debug("#4 - Unable to load GPU Metrics table version for %s | %s", gpu_id, e.err_info)
|
||||
|
||||
try:
|
||||
# Get GPU Metrics table
|
||||
@@ -5072,7 +5206,7 @@ class AMDSMICommands():
|
||||
gpu_metric_str = json.dumps(gpu_metric_debug_info, indent=4)
|
||||
logging.debug("GPU Metrics table for GPU %s | %s", gpu_id, str(gpu_metric_str))
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
logging.debug("#5 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.err_info)
|
||||
|
||||
# Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls
|
||||
if args.pcie:
|
||||
@@ -5892,13 +6026,13 @@ class AMDSMICommands():
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
|
||||
try:
|
||||
partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu)
|
||||
partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "")
|
||||
profile_type = partition_dict['partition_profile']['profile_type']
|
||||
profile_index = partition_dict['partition_profile']['profile_index']
|
||||
partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "")
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
profile_type = "N/A"
|
||||
profile_index = "N/A"
|
||||
partition_id = "0"
|
||||
partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "")
|
||||
logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info())
|
||||
try:
|
||||
current_mem_cap = amdsmi_interface.amdsmi_get_gpu_memory_partition(gpu)
|
||||
@@ -5975,7 +6109,7 @@ class AMDSMICommands():
|
||||
prev_gpu_id = "N/A"
|
||||
for gpu in args.gpu:
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu)
|
||||
tabular_output_dict = {"gpu_id": "N/A",
|
||||
tabular_output_dict = {"gpu_id": gpu_id,
|
||||
"profile_index": "N/A",
|
||||
"memory_partition_caps": "N/A",
|
||||
"accelerator_type": "N/A",
|
||||
@@ -5990,6 +6124,7 @@ class AMDSMICommands():
|
||||
partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu)
|
||||
partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "")
|
||||
current_accelerator_type = partition_dict['partition_profile']['profile_type']
|
||||
tabular_output_dict["partition_id"] = partition_id
|
||||
|
||||
# save only the primary GPU node's partition_id (the 1st listed device; non N/A one)
|
||||
# else keep current_partition_id unchanged for displaying in accelerator resource's output
|
||||
|
||||
@@ -741,12 +741,25 @@ class AMDSMIHelpers():
|
||||
accelerator_partition_profiles['memory_caps'].append(profile['profiles'][p]['memory_caps'])
|
||||
break # Only need to get the profiles for one device
|
||||
except amdsmi_interface.AmdSmiLibraryException as e:
|
||||
logging.debug(f"AMDSMIHelpers.get_accelerator_partition_profile_config - Unable to get accelerator partition profile config for device {dev}: {str(e)}")
|
||||
if e.err_code == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED:
|
||||
logging.debug(f"AMDSMIHelpers.get_accelerator_partition_profile_config - Device {dev} does not support accelerator partition profiles")
|
||||
return accelerator_partition_profiles
|
||||
break
|
||||
except Exception as e:
|
||||
logging.debug(f"AMDSMIHelpers.get_accelerator_partition_profile_config - Unexpected error occured --> Unable to get accelerator partition profile config for device {dev}: {str(e)}")
|
||||
break
|
||||
return accelerator_partition_profiles
|
||||
|
||||
|
||||
def get_accelerator_choices_types_indices(self):
|
||||
return_val = ("N/A", {'profile_indices':[], 'profile_types':[]})
|
||||
if os.geteuid() != 0:
|
||||
logging.debug("AMDSMIHelpers.get_accelerator_choices_types_indices - Not root, unable to get accelerator partition profiles")
|
||||
# If not root, we can't get the accelerator partition profiles
|
||||
return return_val
|
||||
else:
|
||||
logging.debug("AMDSMIHelpers.get_accelerator_choices_types_indices - Root, getting accelerator partition profiles")
|
||||
accelerator_partition_profiles = self.get_accelerator_partition_profile_config()
|
||||
if len(accelerator_partition_profiles['profile_types']) != 0:
|
||||
compute_partitions_str = accelerator_partition_profiles['profile_types'] + accelerator_partition_profiles['profile_indices']
|
||||
@@ -787,11 +800,15 @@ class AMDSMIHelpers():
|
||||
power_cap_min = amdsmi_interface.MaxUIntegerTypes.UINT64_T # start out at max and min and then find real min and max
|
||||
power_cap_max = 0
|
||||
for dev in device_handles:
|
||||
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(dev)
|
||||
if power_cap_info['max_power_cap'] > power_cap_max:
|
||||
power_cap_max = power_cap_info['max_power_cap']
|
||||
if power_cap_info['min_power_cap'] < power_cap_max:
|
||||
power_cap_min = power_cap_info['min_power_cap']
|
||||
try:
|
||||
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(dev)
|
||||
if power_cap_info['max_power_cap'] > power_cap_max:
|
||||
power_cap_max = power_cap_info['max_power_cap']
|
||||
if power_cap_info['min_power_cap'] < power_cap_max:
|
||||
power_cap_min = power_cap_info['min_power_cap']
|
||||
except amdsmi_interface.AmdSmiLibraryException as e:
|
||||
logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}")
|
||||
continue
|
||||
return (power_cap_min, power_cap_max)
|
||||
|
||||
|
||||
|
||||
@@ -60,6 +60,8 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
pthread_mutex_t* get_mutex();
|
||||
uint32_t get_gpu_id() const;
|
||||
uint32_t get_gpu_fd() const;
|
||||
uint32_t get_card_id(); // -e feature + we can get card_id for our internal functions
|
||||
uint32_t get_drm_render_minor(); // -e feature + we can get card_id for our internal functions
|
||||
std::string& get_gpu_path();
|
||||
amdsmi_bdf_t get_bdf();
|
||||
bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); }
|
||||
@@ -80,9 +82,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
amdsmi_status_t amdgpu_query_driver_date(std::string& date) const;
|
||||
|
||||
// New methods for -e feature
|
||||
std::string bdf_to_string() const;
|
||||
uint32_t get_card_from_bdf() const;
|
||||
uint32_t get_render_id() const;
|
||||
std::string bdf_to_string() const; // -e feature
|
||||
|
||||
private:
|
||||
uint32_t gpu_id_;
|
||||
@@ -91,6 +91,8 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
amdsmi_bdf_t bdf_;
|
||||
uint32_t vendor_id_;
|
||||
AMDSmiDrm& drm_;
|
||||
uint32_t card_index_;
|
||||
uint32_t drm_render_minor_;
|
||||
GPUComputeProcessList_t compute_process_list_;
|
||||
int32_t get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list,
|
||||
ComputeProcessListType_t list_type);
|
||||
|
||||
@@ -2173,13 +2173,17 @@ def amdsmi_get_clock_info(
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"clk": clock_measure.clk,
|
||||
"min_clk": clock_measure.min_clk,
|
||||
"max_clk": clock_measure.max_clk,
|
||||
"clk_locked": clock_measure.clk_locked,
|
||||
"clk_deep_sleep" : clock_measure.clk_deep_sleep,
|
||||
clk_type_str = AmdSmiClkType(clock_type).name
|
||||
|
||||
dict_ret = {
|
||||
"clk": _validate_if_max_uint(clock_measure.clk, MaxUIntegerTypes.UINT32_T),
|
||||
"min_clk": _validate_if_max_uint(clock_measure.min_clk, MaxUIntegerTypes.UINT32_T),
|
||||
"max_clk": _validate_if_max_uint(clock_measure.max_clk, MaxUIntegerTypes.UINT32_T),
|
||||
"clk_locked": _validate_if_max_uint(clock_measure.clk_locked, MaxUIntegerTypes.UINT8_T, isBool=True),
|
||||
"clk_deep_sleep" : _validate_if_max_uint(clock_measure.clk_deep_sleep, MaxUIntegerTypes.UINT8_T, isBool=True),
|
||||
}
|
||||
logging.debug("amdsmi_interface.py | amdsmi_get_clock_info | clk_type = " + clk_type_str + " | return_dictionary = \n" + str(json.dumps(dict_ret, indent=4)))
|
||||
return dict_ret
|
||||
|
||||
|
||||
def amdsmi_get_gpu_bad_page_info(
|
||||
@@ -3129,51 +3133,76 @@ def amdsmi_get_gpu_accelerator_partition_profile(
|
||||
raise AmdSmiParameterException(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
exception_caught = False
|
||||
length = 8
|
||||
partition_id = [0, 0, 0, 0, 0, 0, 0, 0]
|
||||
partition_id_list = (ctypes.c_uint32 * length)(*partition_id)
|
||||
profile = amdsmi_wrapper.amdsmi_accelerator_partition_profile_t()
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle,
|
||||
ctypes.byref(profile), partition_id_list)
|
||||
)
|
||||
profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[profile.profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "")
|
||||
profile_type_ret = profile_type_ret.replace("INVALID", "N/A")
|
||||
|
||||
length = profile.num_partitions
|
||||
partition_ids = []
|
||||
|
||||
#partition_id[0] will contain the partition id of each device
|
||||
#BM/Guest will include this logic. Host will only display primary partition ids.
|
||||
kPOSITION_OF_PARTITION_ID = 0
|
||||
partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID])
|
||||
|
||||
mem_caps_list = []
|
||||
if profile.memory_caps.nps_flags.nps1_cap == 1:
|
||||
mem_caps_list.append("NPS1")
|
||||
if profile.memory_caps.nps_flags.nps2_cap == 1:
|
||||
mem_caps_list.append("NPS2")
|
||||
if profile.memory_caps.nps_flags.nps4_cap == 1:
|
||||
mem_caps_list.append("NPS4")
|
||||
if profile.memory_caps.nps_flags.nps8_cap == 1:
|
||||
mem_caps_list.append("NPS8")
|
||||
ret = amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle,
|
||||
ctypes.byref(profile), partition_id_list)
|
||||
if ret == amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED:
|
||||
#partition_id[0] will contain the partition id of each device
|
||||
#BM/Guest will include this logic. Host will only display primary partition ids.
|
||||
partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID])
|
||||
|
||||
partition_profile_dict = {
|
||||
"profile_type" : profile_type_ret,
|
||||
"num_partitions" : profile.num_partitions,
|
||||
"profile_index" : profile.profile_index,
|
||||
"memory_caps": mem_caps_list,
|
||||
"num_resources" : profile.num_resources,
|
||||
"resources" : "N/A"
|
||||
}
|
||||
return_dictionary = {
|
||||
"partition_id" : partition_ids,
|
||||
"partition_profile" : partition_profile_dict
|
||||
}
|
||||
|
||||
logging.debug("amdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile | return_dictionary = \n" + str(json.dumps(return_dictionary, indent=4)))
|
||||
return return_dictionary
|
||||
try:
|
||||
_check_res(ret)
|
||||
except AmdSmiException as e:
|
||||
logging.debug("amdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile | exception_caught >> " + str(e))
|
||||
partition_profile_dict = {
|
||||
"profile_type" : "N/A",
|
||||
"num_partitions" : "N/A",
|
||||
"profile_index" : "N/A",
|
||||
"memory_caps": "N/A",
|
||||
"num_resources" : "N/A",
|
||||
"resources" : "N/A"
|
||||
}
|
||||
return_dictionary = {
|
||||
"partition_id" : partition_ids,
|
||||
"partition_profile" : partition_profile_dict
|
||||
}
|
||||
if ret == amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED:
|
||||
exception_caught = True
|
||||
else:
|
||||
_check_res(ret) # re-raise the exception if error is anything other than AMDSMI_STATUS_NOT_SUPPORTED
|
||||
# this ensures we can get partition ID even if the profile is not supported.
|
||||
finally:
|
||||
if exception_caught:
|
||||
logging.debug("amdsmi_interface.py | exception_caught >> amdsmi_get_gpu_accelerator_partition_profile | return_dictionary = \n" + str(json.dumps(return_dictionary, indent=4)))
|
||||
return return_dictionary
|
||||
else:
|
||||
profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[profile.profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "")
|
||||
profile_type_ret = profile_type_ret.replace("INVALID", "N/A")
|
||||
length = profile.num_partitions
|
||||
#partition_id[0] will contain the partition id of each device
|
||||
#BM/Guest will include this logic. Host will only display primary partition ids.
|
||||
partition_ids.append(partition_id_list[kPOSITION_OF_PARTITION_ID])
|
||||
mem_caps_list = []
|
||||
if profile.memory_caps.nps_flags.nps1_cap == 1:
|
||||
mem_caps_list.append("NPS1")
|
||||
if profile.memory_caps.nps_flags.nps2_cap == 1:
|
||||
mem_caps_list.append("NPS2")
|
||||
if profile.memory_caps.nps_flags.nps4_cap == 1:
|
||||
mem_caps_list.append("NPS4")
|
||||
if profile.memory_caps.nps_flags.nps8_cap == 1:
|
||||
mem_caps_list.append("NPS8")
|
||||
partition_profile_dict = {
|
||||
"profile_type" : profile_type_ret,
|
||||
"num_partitions" : profile.num_partitions,
|
||||
"profile_index" : profile.profile_index,
|
||||
"memory_caps": mem_caps_list,
|
||||
"num_resources" : profile.num_resources,
|
||||
"resources" : "N/A"
|
||||
}
|
||||
return_dictionary = {
|
||||
"partition_id" : partition_ids,
|
||||
"partition_profile" : partition_profile_dict
|
||||
}
|
||||
logging.debug("amdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile | return_dictionary = \n" + str(json.dumps(return_dictionary, indent=4)))
|
||||
return return_dictionary
|
||||
|
||||
def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: amdsmi_wrapper.amdsmi_processor_handle) -> Dict:
|
||||
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
@@ -4131,11 +4160,14 @@ def amdsmi_get_clk_freq(
|
||||
)
|
||||
)
|
||||
|
||||
clk_type_str = AmdSmiClkType(clk_type).name
|
||||
|
||||
dict_ret = {
|
||||
"num_supported": freq.num_supported,
|
||||
"current": freq.current,
|
||||
"frequency": list(freq.frequency)[: freq.num_supported],
|
||||
}
|
||||
logging.debug("amdsmi_interface.py | amdsmi_get_clk_freq | clk_type = " + clk_type_str + " | return_dictionary = \n" + str(json.dumps(dict_ret, indent=4)))
|
||||
return dict_ret
|
||||
|
||||
|
||||
|
||||
@@ -1445,6 +1445,35 @@ typedef union id {
|
||||
};
|
||||
} rsmi_func_id_value_t;
|
||||
|
||||
/**
 * @struct rsmi_device_identifiers_t
 * @brief Structure to hold various identifiers for a GPU device.
 *
 * @details This structure contains fields that uniquely identify a GPU device,
 * including its card index, DRM render minor, PCI Bus/Device/Function ID (BDFID),
 * KFD GPU ID, partition ID, and SMI device ID.
 *
 * Member documentation uses the trailing `//!<` form, which Doxygen attaches
 * to the member that precedes the comment on the same line.
 */
typedef struct {
  uint32_t card_index;        //!< The card index of the device.
  uint32_t drm_render_minor;  //!< The DRM render minor number of the device.
  uint64_t bdfid;             //!< The PCI Bus/Device/Function identifier (BDFID) of the device.
  uint64_t kfd_gpu_id;        //!< The KFD (Kernel Fusion Driver) GPU ID of the device.
  uint32_t partition_id;      //!< The partition ID of the device.
  uint32_t smi_device_id;     //!< The SMI (System Management Interface) device ID.
  uint32_t reserved[10];      //!< Reserved space; presumably kept for future fields (TODO confirm).
} rsmi_device_identifiers_t;
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup InitShutAdmin Initialization and Shutdown
|
||||
@@ -2009,6 +2038,36 @@ rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid);
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the device identifiers for a specific GPU device.
|
||||
*
|
||||
* @details This function retrieves various identifiers for a GPU device, such as
|
||||
* the card index, DRM render minor, BDFID, KFD GPU ID, partition ID, and SMI device ID.
|
||||
* The identifiers are written to the provided `rsmi_device_identifiers_t` structure.
|
||||
*
|
||||
* @param[in] dv_ind a device index.
|
||||
*
|
||||
* @param[out] identifiers A pointer to a structure of type `rsmi_device_identifiers_t`
|
||||
* where the device identifiers will be stored. The structure
|
||||
* contains fields such as:
|
||||
* - `card_index`: The card index of the device.
|
||||
* - `drm_render_minor`: The DRM render minor number.
|
||||
* - `bdfid`: The Bus/Device/Function PCI identifier.
|
||||
* - `kfd_gpu_id`: The KFD GPU ID.
|
||||
* - `partition_id`: The partition ID of the device.
|
||||
* - `smi_device_id`: The SMI device ID.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS The call was successful, and the device identifiers were retrieved.
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED The installed software or hardware does not support this function
|
||||
* with the given arguments.
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS The provided arguments are invalid.
|
||||
*
|
||||
* @note Ensure that the `identifiers` pointer is valid and points to a properly allocated structure
|
||||
* before calling this function.
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_device_identifiers_get(uint32_t dv_ind,
|
||||
rsmi_device_identifiers_t *identifiers);
|
||||
|
||||
|
||||
/** @} */ // end of IDQuer
|
||||
|
||||
|
||||
@@ -200,14 +200,6 @@ class Device {
|
||||
public:
|
||||
explicit Device(std::string path, RocmSMI_env_vars const *e);
|
||||
~Device(void);
|
||||
typedef struct {
|
||||
uint32_t card_index;
|
||||
uint32_t drm_render_minor;
|
||||
uint64_t bdfid;
|
||||
uint64_t kfd_gpu_id;
|
||||
uint32_t partition_id;
|
||||
uint32_t smi_device_id;
|
||||
} rsmi_device_identifiers_t;
|
||||
|
||||
void set_monitor(std::shared_ptr<Monitor> m) {monitor_ = m;}
|
||||
std::string path(void) const {return path_;}
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <map>
|
||||
#include <mutex> // NOLINT
|
||||
#include <utility>
|
||||
#include <limits>
|
||||
|
||||
#include "rocm_smi/rocm_smi_io_link.h"
|
||||
#include "rocm_smi/rocm_smi_kfd.h"
|
||||
@@ -109,6 +110,15 @@ class RocmSMI {
|
||||
io_link_map_;
|
||||
std::map<uint32_t, uint32_t> dev_ind_to_node_ind_map_;
|
||||
void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0);
|
||||
typedef struct {
|
||||
uint32_t card_index = std::numeric_limits<uint32_t>::max();
|
||||
std::string dev_name = "";
|
||||
std::string drm_render_path = "";
|
||||
std::string drm_card_path = "";
|
||||
uint32_t drm_render_minor = std::numeric_limits<uint32_t>::max();
|
||||
uint64_t bdfid = std::numeric_limits<uint64_t>::max();
|
||||
} rsmi_device_enumeration_t;
|
||||
rsmi_status_t AddToDeviceList2(rsmi_device_enumeration_t device);
|
||||
void GetEnvVariables(void);
|
||||
std::shared_ptr<Monitor> FindMonitor(std::string monitor_path);
|
||||
|
||||
|
||||
@@ -6569,8 +6569,10 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
|
||||
std::string strCompPartition = "UNKNOWN";
|
||||
const uint32_t PARTITION_LEN = 10;
|
||||
char compute_partition[PARTITION_LEN];
|
||||
compute_partition[0] = '\0';
|
||||
rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
strCompPartition.clear();
|
||||
strCompPartition = compute_partition;
|
||||
}
|
||||
uint64_t pci_id = UINT64_MAX;
|
||||
@@ -6583,11 +6585,12 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
|
||||
bdf_sstream << std::hex << std::setfill('0') << std::setw(4)
|
||||
<< ((pci_id >> 32) & 0xFFFFFFFF) << ":";
|
||||
bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 8) & 0xFF) << ":";
|
||||
bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 3) & 0xF8) << ".";
|
||||
bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 3) & 0x1F) << ".";
|
||||
bdf_sstream << std::hex << std::setfill('0') << +(pci_id & 0x7);
|
||||
bdf_sstream << "\nPartition ID ((pci_id >> 28) & 0xf): " << std::dec
|
||||
bdf_sstream << "\n[Option 1] Partition ID ((pci_id >> 28) & 0xf): " << std::dec
|
||||
<< static_cast<int>((pci_id >> 28) & 0xf);
|
||||
bdf_sstream << "\nPartition ID (pci_id & 0x7): " << std::dec << static_cast<int>(pci_id & 0x7);
|
||||
bdf_sstream << "\n[Option 2] Partition ID (pci_id & 0x7): " << std::dec
|
||||
<< static_cast<int>(pci_id & 0x7);
|
||||
// std::cout << __PRETTY_FUNCTION__ << " BDF: " << bdf_sstream.str() << std::endl;
|
||||
|
||||
/**
|
||||
@@ -6605,15 +6608,18 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
|
||||
* bits [7:3] = Device
|
||||
* bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes
|
||||
*/
|
||||
|
||||
// If the partition_id is still not set (bits [31:28]), we will use the fallback
|
||||
// in function bits. We will use bits [2:0] as the partition ID.
|
||||
if (*partition_id != UINT32_MAX && *partition_id == 0 &&
|
||||
(strCompPartition == "DPX" || strCompPartition == "TPX"
|
||||
|| strCompPartition == "CPX" || strCompPartition == "QPX")) {
|
||||
static_cast<uint32_t>(pci_id & 0x7) != 0) {
|
||||
*partition_id = static_cast<uint32_t>(pci_id & 0x7);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Success"
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Compute Partition: " << strCompPartition
|
||||
<< " | Type: partition_id"
|
||||
<< " | Data: " << static_cast<int>(*partition_id)
|
||||
<< " | Returning = "
|
||||
@@ -7487,6 +7493,21 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind)
|
||||
CATCH
|
||||
}
|
||||
|
||||
// Public entry point: fetch the identifier set for device index `dv_ind`.
// The device is resolved through GET_DEV_FROM_INDX (which declares `dev`);
// the actual lookup is delegated to Device::get_smi_device_identifiers().
// A null output pointer yields RSMI_STATUS_INVALID_ARGS.
rsmi_status_t rsmi_dev_device_identifiers_get(uint32_t dv_ind,
                        rsmi_device_identifiers_t *smi_device_identifiers) {
  TRY
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(ss);
  GET_DEV_FROM_INDX

  // The index is validated first (by the macro above), then the out pointer.
  if (smi_device_identifiers != nullptr) {
    return dev->get_smi_device_identifiers(dv_ind, smi_device_identifiers);
  }
  return RSMI_STATUS_INVALID_ARGS;
  CATCH
}
|
||||
|
||||
|
||||
// UNDOCUMENTED FUNCTIONS
|
||||
// This functions are not declared in rocm_smi.h. They are either not fully
|
||||
|
||||
@@ -1809,6 +1809,7 @@ std::string Device::readBootPartitionState<rsmi_memory_partition_type_t>(
|
||||
rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id,
|
||||
rsmi_device_identifiers_t *device_identifiers) {
|
||||
bool found_device = false;
|
||||
std::ostringstream ss;
|
||||
rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED;
|
||||
if (device_identifiers == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
@@ -1816,20 +1817,38 @@ rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id,
|
||||
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
auto devices = smi.devices();
|
||||
ss << __PRETTY_FUNCTION__ << " | device_id = " << device_id
|
||||
<< "; devices.size() = " << devices.size();
|
||||
// std::cout << ss.str() << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
for (uint32_t i = 0; i < devices.size(); i++) {
|
||||
if (i != device_id) {
|
||||
continue;
|
||||
}
|
||||
rsmi_device_identifiers_t smi_device;
|
||||
smi_device.card_index = devices[i]->index();
|
||||
smi_device.drm_render_minor = devices[i]->drm_render_minor();
|
||||
smi_device.bdfid = devices[i]->bdfid();
|
||||
smi_device.kfd_gpu_id = devices[i]->kfd_gpu_id();
|
||||
smi_device.partition_id = devices[i]->m_partition_id;
|
||||
smi_device.smi_device_id = i;
|
||||
*device_identifiers = smi_device;
|
||||
|
||||
device_identifiers->card_index = devices[i]->index();
|
||||
device_identifiers->drm_render_minor = devices[i]->drm_render_minor();
|
||||
device_identifiers->bdfid = devices[i]->bdfid();
|
||||
device_identifiers->kfd_gpu_id = devices[i]->kfd_gpu_id();
|
||||
uint32_t temp_partition_id = 0;
|
||||
rsmi_status_t ret = rsmi_dev_partition_id_get(
|
||||
i, &temp_partition_id);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
temp_partition_id = 0;
|
||||
}
|
||||
device_identifiers->partition_id = temp_partition_id;
|
||||
device_identifiers->smi_device_id = i;
|
||||
found_device = true;
|
||||
ss << __PRETTY_FUNCTION__ << " | Found device: "
|
||||
<< "card_index = " << device_identifiers->card_index
|
||||
<< "; drm_render_minor = " << device_identifiers->drm_render_minor
|
||||
<< "; bdfid = " << std::hex << "0x" << device_identifiers->bdfid
|
||||
<< "; kfd_gpu_id = " << std::dec << device_identifiers->kfd_gpu_id
|
||||
<< "; partition_id = " << device_identifiers->partition_id
|
||||
<< "; smi_device_id = " << device_identifiers->smi_device_id;
|
||||
// std::cout << ss.str() << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
break;
|
||||
}
|
||||
if (found_device) {
|
||||
|
||||
@@ -4570,8 +4570,13 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) {
|
||||
|
||||
dev->set_smi_device_id(dv_ind);
|
||||
uint32_t partition_id = 0;
|
||||
rsmi_dev_partition_id_get(dv_ind, &partition_id);
|
||||
dev->set_smi_partition_id(partition_id);
|
||||
auto ret = rsmi_dev_partition_id_get(dv_ind, &partition_id);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
dev->set_smi_partition_id(partition_id);
|
||||
} else {
|
||||
dev->set_smi_partition_id(0);
|
||||
}
|
||||
|
||||
dev->dev_log_gpu_metrics(ostrstream);
|
||||
|
||||
const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics();
|
||||
|
||||
@@ -53,8 +53,6 @@ static const char *kPathDRMRoot = "/sys/class/drm";
|
||||
static const char *kPathHWMonRoot = "/sys/class/hwmon";
|
||||
static const char *kPathPowerRoot = "/sys/kernel/debug/dri";
|
||||
|
||||
static const char *kDeviceNamePrefix = "card";
|
||||
|
||||
static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""};
|
||||
|
||||
namespace amd {
|
||||
@@ -107,6 +105,44 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
|
||||
return static_cast<uint32_t>(drm_minor);
|
||||
}
|
||||
|
||||
// Find the drm minor from from sysfs path "/sys/class/drm/renderDX/device/drm".
|
||||
// From the directory cardN in that sysfs path, the card number can be
|
||||
// computed for renderDX.
|
||||
// On success, return drm_minor which is >= 128 otherwise return 0xFFFFFFFF
|
||||
static uint32_t GetCard(const std::string s) {
|
||||
std::ostringstream ss;
|
||||
std::string drm_path = s;
|
||||
int card_num = -1;
|
||||
const std::string card_file_prefix = "card";
|
||||
const uint64_t prefix_size = card_file_prefix.size();
|
||||
drm_path += "/device/drm";
|
||||
|
||||
auto card_dir = opendir(drm_path.c_str());
|
||||
if (card_dir == nullptr)
|
||||
return static_cast<uint32_t>(-1);
|
||||
|
||||
auto dentry = readdir(card_dir);
|
||||
|
||||
while (dentry != nullptr) {
|
||||
std::string card_file = dentry->d_name;
|
||||
if (!card_file.compare(0, prefix_size, card_file_prefix)) {
|
||||
card_num = stoi(card_file.substr(prefix_size));
|
||||
if (card_num)
|
||||
break;
|
||||
}
|
||||
dentry = readdir(card_dir);
|
||||
}
|
||||
|
||||
if (closedir(card_dir)) {
|
||||
return static_cast<uint32_t>(-1);
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | Discovered card = "
|
||||
<< std::to_string(card_num) << " | For drm_path = " << drm_path << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return static_cast<uint32_t>(card_num);
|
||||
}
|
||||
|
||||
// Determine if provided string is a bdfid pci path directory of the form
|
||||
// XXXX:XX:XX.X,
|
||||
// domain:bus:device.function
|
||||
@@ -170,12 +206,13 @@ static bool bdfid_from_path(const std::string in_name, uint64_t *bdfid) {
|
||||
|
||||
// 0 = successful bdfid found
|
||||
// 1 = not a good bdfid found
|
||||
static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
|
||||
[[maybe_unused]] static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
|
||||
std::ostringstream ss;
|
||||
assert(bdfid != nullptr);
|
||||
const unsigned int MAX_BDF_LENGTH = 512;
|
||||
char tpath[MAX_BDF_LENGTH] = {'\0'};
|
||||
ssize_t ret;
|
||||
memset(tpath,0,MAX_BDF_LENGTH);
|
||||
memset(tpath, 0, MAX_BDF_LENGTH);
|
||||
|
||||
ret = readlink(path.c_str(), tpath, MAX_BDF_LENGTH);
|
||||
|
||||
@@ -183,6 +220,12 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
|
||||
assert(ret < MAX_BDF_LENGTH);
|
||||
|
||||
if (ret <= 0 || ret >= MAX_BDF_LENGTH) {
|
||||
ss << __PRETTY_FUNCTION__ << " | readlink failed for path = "
|
||||
<< path << " | ret = " << ret
|
||||
<< " | errno = " << errno
|
||||
<< " | error = " << strerror(errno);
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_ERROR(ss);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -200,11 +243,19 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
|
||||
tmp = tpath_str.substr(slash_i + 1, end_i - slash_i);
|
||||
|
||||
if (bdfid_from_path(tmp, bdfid)) {
|
||||
ss << __PRETTY_FUNCTION__ << " | Found bdfid = "
|
||||
<< print_int_as_hex(*bdfid, true, 8) << " | from path = "
|
||||
<< path << " | tmp = " << tmp << std::endl;
|
||||
LOG_INFO(ss);
|
||||
return 0;
|
||||
}
|
||||
end_i = slash_i - 1;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | No valid bdfid found in path = "
|
||||
<< path << " | tpath = " << tpath
|
||||
<< " | errno = " << errno
|
||||
<< " | error = " << strerror(errno) << std::endl;
|
||||
LOG_ERROR(ss);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -253,41 +304,8 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
"DiscoverAmdgpuDevices() failed.");
|
||||
}
|
||||
|
||||
uint64_t bdfid;
|
||||
for (auto & device : devices_) {
|
||||
if (ConstructBDFID(device->path(), &bdfid) != 0) {
|
||||
std::cerr << "Failed to construct BDFID." << std::endl;
|
||||
ret = 1;
|
||||
} else if (device->bdfid() != UINT64_MAX && device->bdfid() != bdfid) {
|
||||
// handles secondary partitions - compute partition feature nodes
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | [before] device->path() = " << device->path()
|
||||
<< "\n | bdfid = " << bdfid
|
||||
<< "\n | device->bdfid() = " << device->bdfid()
|
||||
<< " (" << print_int_as_hex(device->bdfid()) << ")"
|
||||
<< "\n | (xgmi node) setting to setting "
|
||||
<< "device->set_bdfid(device->bdfid())";
|
||||
LOG_TRACE(ss);
|
||||
device->set_bdfid(device->bdfid());
|
||||
} else {
|
||||
// legacy & pcie card updates
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | [before] device->path() = " << device->path()
|
||||
<< "\n | bdfid = " << bdfid
|
||||
<< "\n | device->bdfid() = " << device->bdfid()
|
||||
<< " (" << print_int_as_hex(device->bdfid()) << ")"
|
||||
<< "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)";
|
||||
LOG_TRACE(ss);
|
||||
device->set_bdfid(bdfid);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | [after] device->path() = " << device->path()
|
||||
<< "\n | bdfid = " << bdfid
|
||||
<< "\n | device->bdfid() = " << device->bdfid()
|
||||
<< " (" << print_int_as_hex(device->bdfid()) << ")"
|
||||
<< "\n | final update: device->bdfid() holds correct device bdf";
|
||||
LOG_TRACE(ss);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | about to sort by BDF..." << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
std::shared_ptr<amd::smi::Device> dev;
|
||||
// Sort index based on the BDF, collect BDF id firstly.
|
||||
@@ -382,6 +400,7 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
// displayAppTmpFilesContent();
|
||||
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
|
||||
ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList;
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
@@ -623,9 +642,11 @@ RocmSMI::FindMonitor(std::string monitor_path) {
|
||||
}
|
||||
|
||||
void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) {
|
||||
static const int BYTE = 8;
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
auto dev_path = std::string(kPathDRMRoot);
|
||||
dev_path += "/";
|
||||
dev_path += dev_name;
|
||||
@@ -637,7 +658,8 @@ void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) {
|
||||
|
||||
const std::string& d_name = dev_name;
|
||||
uint32_t card_indx = GetDeviceIndex(d_name);
|
||||
dev->set_drm_render_minor(GetDrmRenderMinor(dev_path));
|
||||
uint32_t drmRenderMinor = GetDrmRenderMinor(dev_path);
|
||||
dev->set_drm_render_minor(drmRenderMinor);
|
||||
dev->set_card_index(card_indx);
|
||||
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
|
||||
if (bdfid != 0) {
|
||||
@@ -646,16 +668,120 @@ void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) {
|
||||
|
||||
devices_.push_back(dev);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Adding to device list dev_name = " << dev_name
|
||||
<< " | path = " << dev_path
|
||||
<< " | bdfid = " << bdfid
|
||||
<< " | card index = " << std::to_string(card_indx) << " | ";
|
||||
<< " | Adding to device list dev_name = " << dev_name << "\n"
|
||||
<< " | path = " << dev_path << "\n"
|
||||
<< " | dName = " << d_name << "\n"
|
||||
<< " | bdfid = " << (bdfid == UINT64_MAX ?
|
||||
"N/A" : print_int_as_hex(bdfid, true, 2*BYTE)) << "\n"
|
||||
<< " | card index = " << std::to_string(card_indx) << "\n"
|
||||
<< " | drmRenderMinor = " << std::to_string(drmRenderMinor) << "\n"
|
||||
<< " | supported_event_groups = " << dev->supported_event_groups() << "\n";
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
// AddToDeviceList2 is used to add a device to the device list.
|
||||
// [precondition] a. Iterate through KFD to find all accessible devices.
|
||||
// [precondition] b. Provide BDFID of the device & the device path (card or render path)
|
||||
// 1. Provide to function:
|
||||
// [optional; Will populate] rsmi_device_enumeration_t->card_index
|
||||
// [optional; Will populate
|
||||
// if card or render path provided] rsmi_device_enumeration_t->dev_name
|
||||
// [optional; Will populate] rsmi_device_enumeration_t->drm_render_path
|
||||
// [optional; Will populate] rsmi_device_enumeration_t->drm_card_path
|
||||
// [optional; Will populate] rsmi_device_enumeration_t->drm_render_minor
|
||||
// [Required] rsmi_device_enumeration_t->bdfid
|
||||
rsmi_status_t RocmSMI::AddToDeviceList2(RocmSMI::rsmi_device_enumeration_t device) {
|
||||
static const int BYTE = 8;
|
||||
std::ostringstream ss;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start ======="
|
||||
<< "\n | card index = [" << std::to_string(device.card_index) << "]\n"
|
||||
<< " | dev_name = [" << device.dev_name << "]\n"
|
||||
<< " | drm_render_path = [" << device.drm_render_path << "]\n"
|
||||
<< " | drm_card_path = [" << device.drm_card_path << "]\n"
|
||||
<< " | drm_render_minor = [" << std::to_string(device.drm_render_minor)
|
||||
<< "]\n | bdfid (value) = [" << (device.bdfid == UINT64_MAX ?
|
||||
"N/A" : print_int_as_hex(device.bdfid, true, 4*BYTE)) << "]\n"
|
||||
<< " | bdfid (str) = ["
|
||||
<< std::hex << std::setfill('0') << std::setw(4)
|
||||
<< ((device.bdfid >> 32) & static_cast<uint64_t>(0xFFFFFFFF)) << ":"
|
||||
<< std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 8)
|
||||
& static_cast<uint64_t>(0xFF)) << ":"
|
||||
<< std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 3)
|
||||
& static_cast<uint64_t>(0x1F)) << "."
|
||||
<< std::hex << std::setfill('0') << std::setw(1) << +(device.bdfid
|
||||
& static_cast<uint64_t>(0x7)) << "]\n";
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_TRACE(ss);
|
||||
auto dev_path = std::string(kPathDRMRoot);
|
||||
|
||||
if (device.dev_name.empty()) {
|
||||
ss << __PRETTY_FUNCTION__ << " | dev_name is empty";
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
dev_path += "/";
|
||||
dev_path += ("renderD" + std::to_string(device.drm_render_minor));
|
||||
uint32_t card_num = GetCard(dev_path);
|
||||
device.dev_name = "card" + std::to_string(card_num);
|
||||
device.drm_render_path = dev_path;
|
||||
device.drm_card_path = std::string(kPathDRMRoot) + "/card" +
|
||||
std::to_string(card_num);
|
||||
device.card_index = card_num;
|
||||
}
|
||||
|
||||
auto dev = std::make_shared<Device>(dev_path, &env_vars_);
|
||||
|
||||
std::shared_ptr<Monitor> m = FindMonitor(dev_path + "/device/hwmon");
|
||||
dev->set_monitor(m);
|
||||
|
||||
const std::string& d_name = device.dev_name;
|
||||
uint32_t card_indx = GetDeviceIndex(d_name);
|
||||
uint32_t drmRenderMinor = GetDrmRenderMinor(dev_path);
|
||||
dev->set_drm_render_minor(drmRenderMinor);
|
||||
dev->set_card_index(card_indx);
|
||||
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
|
||||
if (device.bdfid != 0) {
|
||||
dev->set_bdfid(device.bdfid);
|
||||
}
|
||||
|
||||
devices_.push_back(dev);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Adding to device list dev_name = " << device.dev_name << "\n"
|
||||
<< " | path = " << dev_path << "\n"
|
||||
<< " | dName = " << d_name << "\n"
|
||||
<< " | bdfid = " << (device.bdfid == UINT64_MAX ?
|
||||
"N/A" : print_int_as_hex(device.bdfid, true, 8*BYTE)) << "\n"
|
||||
<< " | card index = " << std::to_string(card_indx) << "\n"
|
||||
<< " | drmRenderMinor = " << std::to_string(drmRenderMinor) << "\n"
|
||||
<< " | supported_event_groups = " << dev->supported_event_groups() << "\n";
|
||||
ss << " | ======= rsmi_device_enumeration_t details =======\n"
|
||||
<< " | card index = [" << std::to_string(device.card_index) << "]\n"
|
||||
<< " | dev_name = [" << device.dev_name << "]\n"
|
||||
<< " | drm_render_path = [" << device.drm_render_path << "]\n"
|
||||
<< " | drm_card_path = [" << device.drm_card_path << "]\n"
|
||||
<< " | drm_render_minor = [" << std::to_string(device.drm_render_minor)
|
||||
<< "]\n | bdfid (value) = [" << (device.bdfid == UINT64_MAX ?
|
||||
"N/A" : print_int_as_hex(device.bdfid, true, 8*BYTE)) << "]\n"
|
||||
<< " | bdfid (str) = ["
|
||||
<< std::hex << std::setfill('0') << std::setw(4)
|
||||
<< ((device.bdfid >> 32) & static_cast<uint64_t>(0xFFFFFFFF)) << ":"
|
||||
<< std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 8)
|
||||
& static_cast<uint64_t>(0xFF)) << ":"
|
||||
<< std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 3)
|
||||
& static_cast<uint64_t>(0x1F)) << "."
|
||||
<< std::hex << std::setfill('0') << std::setw(1) << +(device.bdfid
|
||||
& static_cast<uint64_t>(0x7)) << "]\n"
|
||||
<< " | END";
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static const uint32_t kAmdGpuId = 0x1002;
|
||||
|
||||
static bool isAMDGPU(std::string dev_path) {
|
||||
[[maybe_unused]] static bool isAMDGPU(std::string dev_path) {
|
||||
bool isAmdGpu = false;
|
||||
std::ostringstream ss;
|
||||
std::string vend_path = dev_path + "/device/vendor";
|
||||
@@ -691,44 +817,73 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
return isAmdGpu;
|
||||
}
|
||||
|
||||
uint32_t GetLargestNodeNumber(const std::string& path = "/sys/class/kfd/kfd/topology/nodes/") {
|
||||
std::ostringstream ss;
|
||||
uint32_t largest_node_number = 0;
|
||||
|
||||
// Open the directory
|
||||
DIR* dir = opendir(path.c_str());
|
||||
if (!dir) {
|
||||
// Return UINT32_MAX on error
|
||||
ss << __PRETTY_FUNCTION__ << " | Failed to open directory: " << path
|
||||
<< " | errno = " << errno
|
||||
<< " | error = " << strerror(errno);
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_ERROR(ss);
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
struct dirent* entry;
|
||||
while ((entry = readdir(dir)) != nullptr) {
|
||||
// Skip "." and ".."
|
||||
if (entry->d_name[0] == '.') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if the directory name is a number
|
||||
std::string dir_name(entry->d_name);
|
||||
if (std::all_of(dir_name.begin(), dir_name.end(), ::isdigit)) {
|
||||
uint32_t node_number = static_cast<uint32_t>(std::stoul(dir_name));
|
||||
largest_node_number = std::max(largest_node_number, node_number);
|
||||
}
|
||||
}
|
||||
|
||||
if (closedir(dir)) {
|
||||
// Return UINT32_MAX on error
|
||||
ss << __PRETTY_FUNCTION__ << " | Failed to close directory: " << path
|
||||
<< " | errno = " << errno
|
||||
<< " | error = " << strerror(errno);
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_ERROR(ss);
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
return largest_node_number;
|
||||
}
|
||||
|
||||
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
std::string err_msg;
|
||||
uint32_t count = 0;
|
||||
int32_t cardId = 0;
|
||||
int32_t max_cardId = -1;
|
||||
std::ostringstream ss;
|
||||
|
||||
// If this gets called more than once, clear previous findings.
|
||||
devices_.clear();
|
||||
monitors_.clear();
|
||||
|
||||
auto drm_dir = opendir(kPathDRMRoot);
|
||||
if (drm_dir == nullptr) {
|
||||
err_msg = "Failed to open drm root directory ";
|
||||
err_msg += kPathDRMRoot;
|
||||
err_msg += ".";
|
||||
perror(err_msg.c_str());
|
||||
uint32_t max_nodes = GetLargestNodeNumber();
|
||||
ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
|
||||
<< std::to_string(max_nodes) << " kfd nodes";
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
if (max_nodes == UINT32_MAX) {
|
||||
ss << __PRETTY_FUNCTION__ << " | Failed to get largest node number";
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_ERROR(ss);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto dentry = readdir(drm_dir);
|
||||
|
||||
while (dentry != nullptr) {
|
||||
if (memcmp(dentry->d_name, kDeviceNamePrefix, strlen(kDeviceNamePrefix))
|
||||
== 0) {
|
||||
if ((strcmp(dentry->d_name, ".") == 0) ||
|
||||
(strcmp(dentry->d_name, "..") == 0))
|
||||
continue;
|
||||
sscanf(&dentry->d_name[strlen(kDeviceNamePrefix)], "%d", &cardId);
|
||||
if (cardId > max_cardId)
|
||||
max_cardId = cardId;
|
||||
count++;
|
||||
}
|
||||
dentry = readdir(drm_dir);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
|
||||
<< std::to_string(count) << " cards" << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
// Iterate through all nodes
|
||||
// and read all properties
|
||||
// under /sys/class/kfd/kfd/topology/nodes/
|
||||
// and add to systemNodes vector
|
||||
|
||||
struct systemNode {
|
||||
uint32_t s_node_id = 0;
|
||||
@@ -741,24 +896,27 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
uint8_t s_device = 0;
|
||||
uint8_t s_function = 0;
|
||||
uint8_t s_partition_id = 0;
|
||||
uint32_t s_drm_render_minor = 0;
|
||||
uint64_t padding = 0; // padding added in case new changes in future
|
||||
};
|
||||
// allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id,
|
||||
// location_id, bdf, domain, bus, device,
|
||||
// partition_id}
|
||||
std::multimap<uint64_t, systemNode> allSystemNodes;
|
||||
std::set<uint32_t> gpuNodeIdsFound;
|
||||
std::vector<systemNode> systemNodes;
|
||||
uint32_t node_id = 0;
|
||||
static const int BYTE = 8;
|
||||
while (true) {
|
||||
uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0;
|
||||
while (node_id <= max_nodes) {
|
||||
ss << __PRETTY_FUNCTION__ << " | node_id = " << std::to_string(node_id);
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0, render_d = 0;
|
||||
int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
|
||||
int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
|
||||
int ret_loc_id =
|
||||
read_node_properties(node_id, "location_id", &location_id);
|
||||
int ret_domain = read_node_properties(node_id, "domain", &domain);
|
||||
int ret_renderd = read_node_properties(node_id, "drm_render_minor", &render_d);
|
||||
bool isANode = (ret_gpu_id == 0 &&
|
||||
(ret_domain == 0 && ret_loc_id == 0));
|
||||
(ret_domain == 0 && ret_loc_id == 0 && ret_renderd == 0));
|
||||
ss << __PRETTY_FUNCTION__ << " | isAGpuNode: "
|
||||
<< (isANode ? "TRUE" : "FALSE") << "; is_vm_guest(): "
|
||||
<< (is_vm_guest() ? "TRUE" : "FALSE")
|
||||
@@ -766,11 +924,13 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
<< "; ret_domain: " << ret_domain
|
||||
<< "; ret_loc_id: " << ret_loc_id
|
||||
<< "; ret_unique_id: " << ret_unique_id
|
||||
<< "\nret_renderd: " << ret_renderd
|
||||
<< "\n[node_id = " << print_unsigned_hex_and_int(node_id) << "\n"
|
||||
<< "; gpu_id = " << print_unsigned_hex_and_int(gpu_id) << "\n"
|
||||
<< "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n"
|
||||
<< "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n"
|
||||
<< "; domain = " << print_unsigned_hex_and_int(domain)
|
||||
<< "; domain = " << print_unsigned_hex_and_int(domain) << "\n"
|
||||
<< "; drm_render_minor = " << print_unsigned_hex_and_int(render_d)
|
||||
<< "]\n";
|
||||
LOG_DEBUG(ss);
|
||||
if (isANode || (is_vm_guest() && ret_gpu_id == 0)) {
|
||||
@@ -783,14 +943,11 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
myNode.s_location_id = location_id;
|
||||
myNode.s_domain = domain & 0xFFFFFFFF;
|
||||
myNode.s_bdf = (myNode.s_domain << 32) | (myNode.s_location_id);
|
||||
myNode.s_location_id = myNode.s_bdf;
|
||||
myNode.s_bdf |= ((domain & 0xFFFFFFFF) << 32);
|
||||
myNode.s_location_id = myNode.s_bdf;
|
||||
myNode.s_domain = myNode.s_location_id >> 32;
|
||||
myNode.s_bus = ((myNode.s_location_id >> 8) & 0xFF);
|
||||
myNode.s_device = ((myNode.s_location_id >> 3) & 0x1F);
|
||||
myNode.s_function = myNode.s_location_id & 0x7;
|
||||
myNode.s_partition_id = ((myNode.s_location_id >> 28) & 0xF);
|
||||
myNode.s_drm_render_minor = static_cast<uint32_t>((ret_renderd == 0) ? render_d : 0);
|
||||
if (gpu_id != 0) { // only add gpu nodes, 0 = CPU
|
||||
auto ret = gpuNodeIdsFound.insert(node_id);
|
||||
if (ret.second != false) {
|
||||
@@ -807,292 +964,45 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
<< "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n"
|
||||
<< "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n"
|
||||
<< "; domain = " << print_unsigned_hex_and_int(domain) << "\n"
|
||||
<< "; bus = " << print_unsigned_hex_and_int(myNode.s_bus) << "\n"
|
||||
<< "; device = " << print_unsigned_hex_and_int(myNode.s_device) << "\n"
|
||||
<< "; function = " << print_unsigned_hex_and_int(myNode.s_function) << "\n"
|
||||
<< "; partition_id = " << print_unsigned_hex_and_int(myNode.s_partition_id) << "\n"
|
||||
<< "; bdf = " << print_unsigned_hex_and_int(myNode.s_bdf) << "\n"
|
||||
<< "; drm_render_minor = " << print_unsigned_hex_and_int(myNode.s_drm_render_minor)
|
||||
<< "]\n";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
allSystemNodes.emplace(unique_id, myNode);
|
||||
systemNodes.push_back(myNode);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
node_id++;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {";
|
||||
for (auto i : allSystemNodes) {
|
||||
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
|
||||
<< "; location_id = " << std::to_string(i.second.s_location_id)
|
||||
<< "; bdf = " << print_int_as_hex(i.second.s_bdf)
|
||||
<< "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE)
|
||||
<< "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE)
|
||||
<< "; device = " << print_int_as_hex(i.second.s_device, true, BYTE)
|
||||
<< "; function = " << std::to_string(i.second.s_function)
|
||||
<< "; partition_id = " << std::to_string(i.second.s_partition_id)
|
||||
<< "], ";
|
||||
|
||||
for (auto i : systemNodes) {
|
||||
ss << "\n[node_id = " << std::to_string(i.s_node_id) << "\n"
|
||||
<< "; gpu_id = " << std::to_string(i.s_gpu_id) << "\n"
|
||||
<< "; unique_id = " << std::to_string(i.s_unique_id) << "\n"
|
||||
<< "; location_id = " << std::to_string(i.s_location_id) << "\n"
|
||||
<< "; bdf = " << print_int_as_hex(i.s_bdf) << "\n"
|
||||
<< "; domain = " << print_int_as_hex(i.s_domain, true, 2*BYTE) << "\n"
|
||||
<< "; bus = " << print_int_as_hex(i.s_bus, true, BYTE) << "\n"
|
||||
<< "; device = " << print_int_as_hex(i.s_device, true, BYTE) << "\n"
|
||||
<< "; function = " << std::to_string(i.s_function) << "\n"
|
||||
<< "; partition_id = " << std::to_string(i.s_partition_id) << "\n"
|
||||
<< "; drm_render_minor = " << std::to_string(i.s_drm_render_minor)
|
||||
<< "], \n";
|
||||
rsmi_device_enumeration_t rsmi_device;
|
||||
rsmi_device.dev_name = "";
|
||||
rsmi_device.bdfid = i.s_bdf;
|
||||
rsmi_device.drm_render_minor = i.s_drm_render_minor;
|
||||
AddToDeviceList2(rsmi_device);
|
||||
}
|
||||
ss << "}";
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
uint32_t cardAdded = 0;
|
||||
// Discover all root cards & gpu partitions associated with each
|
||||
for (int32_t cardId = 0; cardId <= max_cardId; cardId++) {
|
||||
std::string path = kPathDRMRoot;
|
||||
path += "/card";
|
||||
path += std::to_string(cardId);
|
||||
uint64_t primary_unique_id = 0;
|
||||
uint64_t device_uuid = 0;
|
||||
bool doesDeviceSupportPartitions = false;
|
||||
// get current partition
|
||||
int kSize = 256;
|
||||
char computePartition[kSize];
|
||||
std::string strCompPartition = "UNKNOWN";
|
||||
uint32_t numMonDevices = 0;
|
||||
rsmi_num_monitor_devices(&numMonDevices);
|
||||
|
||||
// each identified gpu card node is a primary node for
|
||||
// potential matching unique ids
|
||||
if (isAMDGPU(path) ||
|
||||
(init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) {
|
||||
std::string d_name = "card";
|
||||
d_name += std::to_string(cardId);
|
||||
uint32_t numMonDevices = 0;
|
||||
rsmi_num_monitor_devices(&numMonDevices);
|
||||
if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize)
|
||||
== RSMI_STATUS_SUCCESS) {
|
||||
strCompPartition = computePartition;
|
||||
doesDeviceSupportPartitions = true;
|
||||
}
|
||||
rsmi_status_t ret_unique_id =
|
||||
rsmi_dev_unique_id_get(cardAdded, &device_uuid);
|
||||
auto temp_numb_nodes = allSystemNodes.count(device_uuid);
|
||||
auto primaryBdfId =
|
||||
allSystemNodes.lower_bound(device_uuid)->second.s_location_id;
|
||||
auto i = allSystemNodes.lower_bound(device_uuid);
|
||||
if (doesDeviceSupportPartitions && temp_numb_nodes > 1
|
||||
&& ret_unique_id == RSMI_STATUS_SUCCESS) {
|
||||
// helps identify xgmi nodes (secondary nodes) easier
|
||||
ss << __PRETTY_FUNCTION__ << " | secondary node add ; "
|
||||
<< " BDF = " << std::to_string(primaryBdfId)
|
||||
<< " (" << print_int_as_hex(primaryBdfId) << ")";
|
||||
LOG_DEBUG(ss);
|
||||
if (doesDeviceSupportPartitions && strCompPartition != "SPX"
|
||||
&& i->second.s_partition_id == 0) {
|
||||
i->second.s_partition_id = i->second.s_function;
|
||||
ss << __PRETTY_FUNCTION__ << " | (secondary node add) fall back - "
|
||||
<< "detected !SPX && partition_id == 0"
|
||||
<< "; function = " << std::to_string(i->second.s_function)
|
||||
<< "; partition_id = " << std::to_string(i->second.s_partition_id);
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | (secondary node add) B4 AddToDeviceList() -->"
|
||||
<< "\n[node_id = " << std::to_string(i->second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i->second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i->second.s_unique_id)
|
||||
<< "; location_id = " << std::to_string(i->second.s_location_id)
|
||||
<< "; bdf = " << print_int_as_hex(i->second.s_bdf)
|
||||
<< "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE)
|
||||
<< "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE)
|
||||
<< "; device = " << print_int_as_hex(i->second.s_device, true, BYTE)
|
||||
<< "; function = " << std::to_string(i->second.s_function)
|
||||
<< "; partition_id = " << std::to_string(i->second.s_partition_id)
|
||||
<< "], ";
|
||||
LOG_DEBUG(ss);
|
||||
ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #1 (secondary node) \n"
|
||||
<< "; bdf: " << print_unsigned_hex_and_int(primaryBdfId) << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
} else {
|
||||
ss << __PRETTY_FUNCTION__ << " | primary node add ; "
|
||||
<< " BDF = " << std::to_string(UINT64_MAX);
|
||||
if (doesDeviceSupportPartitions && strCompPartition != "SPX"
|
||||
&& i->second.s_partition_id == 0) {
|
||||
i->second.s_partition_id = i->second.s_function;
|
||||
ss << __PRETTY_FUNCTION__ << " | (primary node add) fall back - "
|
||||
<< "detected !SPX && partition_id == 0"
|
||||
<< "; function = " << std::to_string(i->second.s_function)
|
||||
<< "; partition_id = " << std::to_string(i->second.s_partition_id);
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
LOG_DEBUG(ss);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | (primary node add) After AddToDeviceList() -->"
|
||||
<< "\n[node_id = " << std::to_string(i->second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i->second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i->second.s_unique_id)
|
||||
<< "; location_id = " << std::to_string(i->second.s_location_id)
|
||||
<< "; bdf = " << print_int_as_hex(i->second.s_bdf)
|
||||
<< "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE)
|
||||
<< "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE)
|
||||
<< "; device = " << print_int_as_hex(i->second.s_device, true, BYTE)
|
||||
<< "; function = " << std::to_string(i->second.s_function)
|
||||
<< "; partition_id = " << std::to_string(i->second.s_partition_id)
|
||||
<< "], ";
|
||||
LOG_DEBUG(ss);
|
||||
ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #2 (primary node) \n"
|
||||
<< "; bdf: " << print_unsigned_hex_and_int(UINT64_MAX) << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
AddToDeviceList(d_name, UINT64_MAX);
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Ordered system nodes seen in lookup = {";
|
||||
for (auto i : allSystemNodes) {
|
||||
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
|
||||
<< "; location_id = " << std::to_string(i.second.s_location_id)
|
||||
<< "; bdf = " << print_int_as_hex(i.second.s_bdf)
|
||||
<< "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE)
|
||||
<< "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE)
|
||||
<< "; device = " << print_int_as_hex(i.second.s_device, true, BYTE)
|
||||
<< "; function = " << std::to_string(i.second.s_function)
|
||||
<< "; partition_id = " << std::to_string(i.second.s_partition_id)
|
||||
<< "], ";
|
||||
}
|
||||
ss << "}";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
uint64_t temp_primary_unique_id = 0;
|
||||
if (allSystemNodes.empty()) {
|
||||
cardAdded++;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | allSystemNodes.empty() = true, continue...";
|
||||
LOG_DEBUG(ss);
|
||||
continue;
|
||||
}
|
||||
|
||||
// get current partition
|
||||
rsmi_num_monitor_devices(&numMonDevices);
|
||||
if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize)
|
||||
== RSMI_STATUS_SUCCESS) {
|
||||
strCompPartition = computePartition;
|
||||
}
|
||||
if (rsmi_dev_unique_id_get(cardAdded, &device_uuid)
|
||||
!= RSMI_STATUS_SUCCESS) {
|
||||
cardAdded++;
|
||||
allSystemNodes.erase(device_uuid);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_unique_id_get(cardId, &device_uuid)"
|
||||
<< " was not successful, continue.. ";
|
||||
LOG_DEBUG(ss);
|
||||
continue;
|
||||
}
|
||||
|
||||
temp_primary_unique_id =
|
||||
allSystemNodes.find(device_uuid)->second.s_unique_id;
|
||||
temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id);
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | device/node id (cardId) = " << std::to_string(cardId)
|
||||
<< " | card id (cardAdded) = " << std::to_string(cardAdded)
|
||||
<< " | numMonDevices = " << std::to_string(numMonDevices)
|
||||
<< " | compute partition = " << strCompPartition
|
||||
<< " | temp_primary_unique_id = "
|
||||
<< std::to_string(temp_primary_unique_id)
|
||||
<< " | Num of nodes matching temp_primary_unique_id = "
|
||||
<< temp_numb_nodes
|
||||
<< " | device_uuid (hex/uint) = "
|
||||
<< print_unsigned_hex_and_int(device_uuid)
|
||||
<< " | device_uuid (uint64_t) = " << device_uuid;
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
if (temp_primary_unique_id != 0) {
|
||||
primary_unique_id = temp_primary_unique_id;
|
||||
} else {
|
||||
cardAdded++;
|
||||
// remove already added nodes associated with current card
|
||||
allSystemNodes.erase(0);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto numb_nodes = allSystemNodes.count(primary_unique_id);
|
||||
ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = "
|
||||
<< std::to_string(primary_unique_id) << " has "
|
||||
<< std::to_string(numb_nodes) << " known gpu nodes";
|
||||
LOG_DEBUG(ss);
|
||||
while (numb_nodes > 1) {
|
||||
std::string secNode = "card";
|
||||
secNode += std::to_string(cardId); // maps the primary node card to
|
||||
// secondary - allows get/sets
|
||||
auto it = allSystemNodes.lower_bound(device_uuid);
|
||||
auto it_end = allSystemNodes.upper_bound(device_uuid);
|
||||
if (numb_nodes == temp_numb_nodes) {
|
||||
auto removalNodeId = it->second.s_node_id;
|
||||
auto removalGpuId = it->second.s_gpu_id;
|
||||
auto removalUniqueId = it->second.s_unique_id;
|
||||
auto removalLocId = it->second.s_location_id;
|
||||
auto removaldomain = it->second.s_domain;
|
||||
auto nodesErased = 1;
|
||||
allSystemNodes.erase(it++);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< "\nPRIMARY --> num_nodes == temp_numb_nodes; ERASING "
|
||||
<< std::to_string(nodesErased) << " node -> [node_id = "
|
||||
<< std::to_string(removalNodeId)
|
||||
<< "; gpu_id = " << std::to_string(removalGpuId)
|
||||
<< "; unique_id = " << std::to_string(removalUniqueId)
|
||||
<< "; location_id = " << std::to_string(removalLocId)
|
||||
<< "; removaldomain = " << std::to_string(removaldomain)
|
||||
<< "]";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
if (it == it_end) {
|
||||
break;
|
||||
}
|
||||
auto myBdfId = it->second.s_location_id;
|
||||
ss << __PRETTY_FUNCTION__ << " | secondary node add #2; "
|
||||
<< " BDF = " << std::to_string(myBdfId)
|
||||
<< " (" << print_int_as_hex(myBdfId) << ")";
|
||||
LOG_DEBUG(ss);
|
||||
if (doesDeviceSupportPartitions && strCompPartition != "SPX"
|
||||
&& it->second.s_partition_id == 0) {
|
||||
it->second.s_partition_id = it->second.s_function;
|
||||
ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) fall back - "
|
||||
<< "detected !SPX && partition_id == 0"
|
||||
<< "; function = " << std::to_string(it->second.s_function)
|
||||
<< "; partition_id = " << std::to_string(it->second.s_partition_id);
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | (secondary node add #2) B4 AddToDeviceList() -->"
|
||||
<< "\n[node_id = " << std::to_string(it->second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(it->second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(it->second.s_unique_id)
|
||||
<< "; location_id = " << std::to_string(it->second.s_location_id)
|
||||
<< "; bdf = " << print_int_as_hex(it->second.s_bdf)
|
||||
<< "; domain = " << print_int_as_hex(it->second.s_domain, true, 2*BYTE)
|
||||
<< "; bus = " << print_int_as_hex(it->second.s_bus, true, BYTE)
|
||||
<< "; device = " << print_int_as_hex(it->second.s_device, true, BYTE)
|
||||
<< "; function = " << std::to_string(it->second.s_function)
|
||||
<< "; partition_id = " << std::to_string(it->second.s_partition_id)
|
||||
<< "], ";
|
||||
LOG_DEBUG(ss);
|
||||
ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #3 (secondary node add #2) \n"
|
||||
<< "; bdf: " << print_unsigned_hex_and_int(myBdfId) << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
AddToDeviceList(secNode, myBdfId);
|
||||
allSystemNodes.erase(it++);
|
||||
numb_nodes--;
|
||||
cardAdded++;
|
||||
}
|
||||
// remove any remaining nodes associated with current card
|
||||
auto erasedNodes = allSystemNodes.erase(primary_unique_id);
|
||||
ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = "
|
||||
<< std::to_string(primary_unique_id) << " erased "
|
||||
<< std::to_string(erasedNodes) << " nodes";
|
||||
LOG_DEBUG(ss);
|
||||
cardAdded++;
|
||||
}
|
||||
}
|
||||
|
||||
if (closedir(drm_dir)) {
|
||||
err_msg = "Failed to close drm root directory ";
|
||||
err_msg += kPathDRMRoot;
|
||||
err_msg += ".";
|
||||
perror(err_msg.c_str());
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -619,10 +619,10 @@ amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle,
|
||||
}
|
||||
|
||||
// Retrieve DRM Card ID
|
||||
info->drm_card = gpu_device->get_card_from_bdf();
|
||||
info->drm_card = gpu_device->get_card_id();
|
||||
|
||||
// Retrieve DRM Render ID
|
||||
info->drm_render = gpu_device->get_render_id();
|
||||
info->drm_render = gpu_device->get_drm_render_minor();
|
||||
|
||||
// Retrieve HIP ID (difference from the smallest node ID) and HSA ID
|
||||
std::map<uint64_t, std::shared_ptr<amd::smi::KFDNode>> nodes;
|
||||
@@ -2267,6 +2267,7 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc
|
||||
<< "\n profile_config->profiles[i].num_resources: "
|
||||
<< profile_config->profiles[i].num_resources
|
||||
<< std::endl;
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
@@ -2425,6 +2426,7 @@ amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle proc
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | END returning " << smi_amdgpu_get_status_string(return_status, false);
|
||||
// std::cout << ss.str() << std::endl;
|
||||
LOG_INFO(ss);
|
||||
|
||||
return return_status;
|
||||
@@ -2791,6 +2793,9 @@ amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle,
|
||||
{
|
||||
AMDSMI_CHECK_INIT();
|
||||
// nullptr api supported
|
||||
if (header_value != nullptr) {
|
||||
*header_value = amd_metrics_table_header_t{}; // Use a default initializer for the struct
|
||||
}
|
||||
|
||||
return rsmi_wrapper(rsmi_dev_metrics_header_info_get, processor_handle, 0,
|
||||
reinterpret_cast<metrics_table_header_t*>(header_value));
|
||||
@@ -2802,7 +2807,7 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info(
|
||||
AMDSMI_CHECK_INIT();
|
||||
// nullptr api supported
|
||||
if (pgpu_metrics != nullptr) {
|
||||
*pgpu_metrics = {};
|
||||
*pgpu_metrics = amdsmi_gpu_metrics_t{}; // Use a default initializer for the struct
|
||||
}
|
||||
return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, 0,
|
||||
reinterpret_cast<rsmi_gpu_metrics_t*>(pgpu_metrics));
|
||||
@@ -3805,7 +3810,7 @@ amdsmi_get_gpu_cper_entries(
|
||||
return status;
|
||||
}
|
||||
std::string path = std::string("/sys/kernel/debug/dri/") +
|
||||
std::to_string(gpu_device->get_card_from_bdf()) +
|
||||
std::to_string(gpu_device->get_card_id()) +
|
||||
"/amdgpu_ring_cper";
|
||||
|
||||
|
||||
@@ -3957,6 +3962,7 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han
|
||||
|
||||
amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, amdsmi_pcie_info_t *info) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
std::ostringstream ss;
|
||||
|
||||
if (info == nullptr) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
@@ -3984,7 +3990,10 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
|
||||
fscanf(fp, "%d", &pcie_width);
|
||||
fclose(fp);
|
||||
} else {
|
||||
printf("Failed to open file: %s \n", path_max_link_width.c_str());
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open file: " << path_max_link_width
|
||||
<< " | returning AMDSMI_STATUS_API_FAILED";
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_API_FAILED;
|
||||
}
|
||||
info->pcie_static.max_pcie_width = (uint16_t)pcie_width;
|
||||
|
||||
@@ -42,6 +42,64 @@ uint32_t AMDSmiGPUDevice::get_gpu_id() const {
|
||||
return gpu_id_;
|
||||
}
|
||||
|
||||
uint32_t AMDSmiGPUDevice::get_card_id() {
|
||||
std::ostringstream ss;
|
||||
// Should never return not_supported, but just in case
|
||||
rsmi_status_t ret = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED;
|
||||
uint32_t gpu_index = this->get_gpu_id();
|
||||
rsmi_device_identifiers_t identifiers = rsmi_device_identifiers_t{};
|
||||
ret = rsmi_dev_device_identifiers_get(gpu_index, &identifiers);
|
||||
if (ret != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
this->card_index_ = std::numeric_limits<uint32_t>::max();
|
||||
} else {
|
||||
this->card_index_ = identifiers.card_index;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_identifiers_get status: " << getRSMIStatusString(ret, false) << "\n"
|
||||
<< " | gpu_id_: " << gpu_id_ << "\n"
|
||||
<< " | identifiers.card_index: " << identifiers.card_index << "\n"
|
||||
<< " | identifiers.drm_render_minor: " << identifiers.drm_render_minor << "\n"
|
||||
<< " | identifiers.bdfid: " << std::hex << "0x" << identifiers.bdfid << "\n"
|
||||
<< " | identifiers.kfd_gpu_id: " << std::dec << identifiers.kfd_gpu_id << "\n"
|
||||
<< " | identifiers.partition_id: " << identifiers.partition_id << "\n"
|
||||
<< " | identifiers.smi_device_id: " << identifiers.smi_device_id << "\n"
|
||||
<< " | returning card_index_: "
|
||||
<< this->card_index_ << std::endl;
|
||||
// std::cout << ss.str();
|
||||
LOG_DEBUG(ss);
|
||||
return this->card_index_;
|
||||
}
|
||||
|
||||
uint32_t AMDSmiGPUDevice::get_drm_render_minor() {
|
||||
std::ostringstream ss;
|
||||
// Should never return not_supported, but just in case
|
||||
rsmi_status_t ret = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED;
|
||||
uint32_t gpu_index = this->get_gpu_id();
|
||||
rsmi_device_identifiers_t identifiers = rsmi_device_identifiers_t{};
|
||||
ret = rsmi_dev_device_identifiers_get(gpu_index, &identifiers);
|
||||
if (ret != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
this->drm_render_minor_ = std::numeric_limits<uint32_t>::max();
|
||||
} else {
|
||||
this->drm_render_minor_ = identifiers.drm_render_minor;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_identifiers_get status: " << getRSMIStatusString(ret, false) << "\n"
|
||||
<< " | gpu_id_: " << gpu_id_ << "\n"
|
||||
<< " | identifiers.card_index: " << identifiers.card_index << "\n"
|
||||
<< " | identifiers.drm_render_minor: " << identifiers.drm_render_minor << "\n"
|
||||
<< " | identifiers.bdfid: " << std::hex << "0x" << identifiers.bdfid << "\n"
|
||||
<< " | identifiers.kfd_gpu_id: " << std::dec << identifiers.kfd_gpu_id << "\n"
|
||||
<< " | identifiers.partition_id: " << identifiers.partition_id << "\n"
|
||||
<< " | identifiers.smi_device_id: " << identifiers.smi_device_id << "\n"
|
||||
<< " | returning drm_render_minor_: "
|
||||
<< this->drm_render_minor_ << std::endl;
|
||||
// std::cout << ss.str();
|
||||
LOG_DEBUG(ss);
|
||||
return this->drm_render_minor_;
|
||||
}
|
||||
|
||||
uint32_t AMDSmiGPUDevice::get_gpu_fd() const {
|
||||
return fd_;
|
||||
}
|
||||
@@ -323,81 +381,6 @@ std::string AMDSmiGPUDevice::bdf_to_string() const {
|
||||
}
|
||||
|
||||
|
||||
uint32_t AMDSmiGPUDevice::get_card_from_bdf() const {
|
||||
const std::string drm_path = "/sys/class/drm/";
|
||||
|
||||
DIR* dir = opendir(drm_path.c_str());
|
||||
if (!dir) {
|
||||
return std::numeric_limits<uint32_t>::max();
|
||||
}
|
||||
|
||||
struct dirent* entry;
|
||||
while ((entry = readdir(dir)) != nullptr) {
|
||||
std::string device_name = entry->d_name;
|
||||
|
||||
// Check if the entry starts with "card"
|
||||
if (device_name.find("card") == 0) {
|
||||
const std::string card_path = drm_path + device_name + "/device";
|
||||
|
||||
// Open the uevent file for the device
|
||||
std::ifstream uevent_file(card_path + "/uevent");
|
||||
if (!uevent_file) {
|
||||
continue; // Skip if the file is not found
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (std::getline(uevent_file, line)) {
|
||||
// Check for the PCI_SLOT_NAME and if it contains the BDF
|
||||
if (line.rfind("PCI_SLOT_NAME", 0) == 0 && line.find(bdf_to_string()) != std::string::npos) {
|
||||
closedir(dir);
|
||||
return std::stoi(device_name.substr(4)); // Convert extracted number to int
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
closedir(dir);
|
||||
return std::numeric_limits<uint32_t>::max(); // Return -1 if no matching card is found
|
||||
}
|
||||
|
||||
uint32_t AMDSmiGPUDevice::get_render_id() const {
|
||||
const std::string drm_path = "/sys/class/drm/";
|
||||
|
||||
DIR* dir = opendir(drm_path.c_str());
|
||||
if (!dir) {
|
||||
return std::numeric_limits<uint32_t>::max();
|
||||
}
|
||||
|
||||
struct dirent* entry;
|
||||
while ((entry = readdir(dir)) != nullptr) {
|
||||
std::string device_name = entry->d_name;
|
||||
|
||||
// Check if the entry starts with "renderD"
|
||||
if (device_name.find("renderD") == 0) {
|
||||
const std::string render_path = drm_path + device_name + "/device";
|
||||
|
||||
// Open the uevent file for the device
|
||||
std::ifstream uevent_file(render_path + "/uevent");
|
||||
if (!uevent_file) {
|
||||
continue; // Skip if the file is not found
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (std::getline(uevent_file, line)) {
|
||||
// Check for the PCI_SLOT_NAME and if it contains the BDF
|
||||
if (line.rfind("PCI_SLOT_NAME", 0) == 0 && line.find(bdf_to_string()) != std::string::npos) {
|
||||
closedir(dir);
|
||||
return std::stoi(device_name.substr(7)); // Extract only the number after "renderD"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
closedir(dir);
|
||||
return std::numeric_limits<uint32_t>::max(); // Return -1 if no matching render ID is found
|
||||
}
|
||||
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ int openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff,
|
||||
bool errorDiscovered = false;
|
||||
std::ifstream file(path, std::ifstream::in);
|
||||
std::string contents = {std::istreambuf_iterator<char>{file}, std::istreambuf_iterator<char>{}};
|
||||
clearCharBufferAndReinitialize(buff, sizeOfBuff, contents);
|
||||
clearCharBufferAndReinitialize(buff, static_cast<uint32_t>(sizeOfBuff), contents);
|
||||
if (!file.is_open()) {
|
||||
errorDiscovered = true;
|
||||
} else {
|
||||
@@ -453,21 +453,12 @@ amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device,
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * @brief Extract the trailing decimal number from a device path or name.
 *
 * Example: "/sys/class/drm/card12" yields 12.
 *
 * @param s Path or name expected to end in one or more decimal digits.
 * @return The trailing number as an unsigned index.
 * @throws std::invalid_argument if @p s does not end in a digit
 *         (propagated from std::stoi on an empty suffix).
 * @throws std::out_of_range if the trailing number does not fit in an int
 *         (propagated from std::stoi).
 */
static uint32_t GetDeviceIndex(const std::string& s) {
  // Position of the last character that is NOT a digit. When the whole string
  // is digits this is npos, and npos + 1 wraps to 0, i.e. "start of string".
  const std::string::size_type last_non_digit = s.find_last_not_of("0123456789");

  // Parse the digit suffix exactly once.
  const int index = std::stoi(s.substr(last_non_digit + 1));

  // A digits-only suffix can never parse as negative; kept as a debug guard.
  assert(index >= 0);
  return static_cast<uint32_t>(index);
}
|
||||
|
||||
amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device,
|
||||
uint32_t *threshold) {
|
||||
SMIGPUDEVICE_MUTEX(device->get_mutex())
|
||||
|
||||
//TODO: Accessing the node requires root privileges, and its interface may need to be exposed in another path
|
||||
uint32_t index = GetDeviceIndex(device->get_gpu_path());
|
||||
uint32_t index = device->get_card_id();
|
||||
std::string fullpath = "/sys/kernel/debug/dri/" + std::to_string(index) + std::string("/ras/bad_page_cnt_threshold");
|
||||
std::ifstream fs(fullpath.c_str());
|
||||
|
||||
@@ -489,7 +480,6 @@ amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* dev
|
||||
amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) {
|
||||
SMIGPUDEVICE_MUTEX(device->get_mutex())
|
||||
|
||||
//uint32_t index = GetDeviceIndex(device->get_gpu_path());
|
||||
//TODO: need to expose the corresponding interface to validate the checksum of ras eeprom table.
|
||||
//verify fail: return AMDSMI_STATUS_CORRUPTED_EEPROM
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
@@ -297,7 +297,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
<< "\n\t\tcurrent_profile.num_resources: " << current_profile.num_resources
|
||||
<< std::endl;
|
||||
}
|
||||
for (auto j = 0; j < current_profile.num_resources; j++) {
|
||||
for (uint32_t j = 0; j < current_profile.num_resources; j++) {
|
||||
auto rp = profile_config.resource_profiles[resource_index];
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
|
||||
Ссылка в новой задаче
Block a user